From ae81d8d215aa26cb41ebb57ab2b3d0f340d5a0a5 Mon Sep 17 00:00:00 2001 From: Jules Villard Date: Wed, 11 Nov 2015 07:45:21 -0800 Subject: [PATCH] handle unicode in source code Summary: public Should address #175. Reviewed By: akotulski Differential Revision: D2641689 fb-gh-sync-id: e98da88 --- infer/lib/python/BuckAnalyze | 2 +- infer/lib/python/infer | 7 +++++-- infer/lib/python/inferTraceBugs | 9 ++++++--- infer/lib/python/inferlib/analyze.py | 25 ++++++++++++----------- infer/lib/python/inferlib/utils.py | 30 ++++++++++++++++++++++------ 5 files changed, 49 insertions(+), 24 deletions(-) diff --git a/infer/lib/python/BuckAnalyze b/infer/lib/python/BuckAnalyze index 2f67bfda0..fcd20240e 100755 --- a/infer/lib/python/BuckAnalyze +++ b/infer/lib/python/BuckAnalyze @@ -338,7 +338,7 @@ def load_stats(opened_jar): def load_csv_report(opened_jar): try: sio = io.StringIO(opened_jar.read(INFER_CSV_REPORT).decode()) - return list(csv.reader(sio)) + return list(utils.locale_csv_reader(sio)) except KeyError as e: raise NotFoundInJar diff --git a/infer/lib/python/infer b/infer/lib/python/infer index c0a03fcc2..97663e06a 100755 --- a/infer/lib/python/infer +++ b/infer/lib/python/infer @@ -68,7 +68,9 @@ def load_module(mod_name): def split_args_to_parse(): dd_index = \ sys.argv.index(CMD_MARKER) if CMD_MARKER in sys.argv else len(sys.argv) - return sys.argv[1:dd_index], sys.argv[dd_index + 1:] + cmd_raw = sys.argv[dd_index + 1:] + return (sys.argv[1:dd_index], + [arg.decode(utils.LOCALE) for arg in cmd_raw]) def create_argparser(parents=[]): @@ -168,7 +170,8 @@ def main(): bugs_filename = os.path.join(args.infer_out, utils.JSON_REPORT_FILENAME) try: - with open(bugs_filename) as bugs_file: + with codecs.open(bugs_filename, 'r', + encoding=utils.LOCALE) as bugs_file: bugs = json.load(bugs_file) if len(bugs) > 0: sys.exit(analyze.BUG_FOUND_ERROR_CODE) diff --git a/infer/lib/python/inferTraceBugs b/infer/lib/python/inferTraceBugs index e6d4e01a9..363046d1f 100755 --- a/infer/lib/python/inferTraceBugs +++ b/infer/lib/python/inferTraceBugs @@ -355,13 +355,15 @@ def generate_html_report(args, reports): i = 0 for bug in sel: bug_trace_path = path_of_bug_number(traces_dir, i) - with open(bug_trace_path, 'w') as bug_trace_file: + with codecs.open(bug_trace_path, 'w', + encoding=utils.LOCALE) as bug_trace_file: bug_trace_file.write(html_bug_trace(args, bug, i)) i += 1 remote_source_template = get_remote_source_template() bug_list_path = os.path.join(html_dir, 'index.html') - with open(bug_list_path, 'w') as bug_list_file: + with codecs.open(bug_list_path, 'w', + encoding=utils.LOCALE) as bug_list_file: bug_list_file.write(html_list_of_bugs(args, remote_source_template, sel)) @@ -373,7 +375,8 @@ def main(): args = base_parser.parse_args() report_filename = os.path.join(args.infer_out, utils.JSON_REPORT_FILENAME) - with open(report_filename) as report_file: + with codecs.open(report_filename, 'r', + encoding=utils.LOCALE) as report_file: reports = json.load(report_file) if args.html: diff --git a/infer/lib/python/inferlib/analyze.py b/infer/lib/python/inferlib/analyze.py index 4ff07ca5f..721e8a0d4 100644 --- a/infer/lib/python/inferlib/analyze.py +++ b/infer/lib/python/inferlib/analyze.py @@ -11,6 +11,7 @@ from __future__ import print_function from __future__ import unicode_literals import argparse +import codecs import csv import glob import json @@ -220,7 +221,6 @@ def create_results_dir(results_dir): def clean(infer_out): - directories = ['multicore', 'classnames', 'sources', jwlib.FILELISTS] extensions = ['.cfg', '.cg'] @@ -346,7 +346,7 @@ def should_report_json(analyzer, row): def clean_json(args, json_report): collected_rows = [] - with open(json_report, 'r') as file_in: + with codecs.open(json_report, 'r', encoding=utils.LOCALE) as file_in: rows = json.load(file_in) for row in rows: filename = row[utils.JSON_INDEX_FILENAME] @@ -357,7 +357,7 @@ def clean_json(args, json_report): collected_rows, cmp=compare_json_rows) temporary_file = tempfile.mktemp() - with open(temporary_file, 'w') as file_out: + with codecs.open(temporary_file, 'w', encoding=utils.LOCALE) as file_out: json.dump(collected_rows, file_out) file_out.flush() shutil.move(temporary_file, json_report) @@ -366,7 +366,7 @@ def clean_json(args, json_report): def clean_csv(args, csv_report): collected_rows = [] with open(csv_report, 'r') as file_in: - reader = csv.reader(file_in) + reader = utils.locale_csv_reader(file_in) rows = [row for row in reader] if len(rows) <= 1: return rows @@ -395,8 +395,8 @@ def print_and_write(file_out, message): def print_errors(csv_report, bugs_out): - with open(csv_report, 'r') as file_in: - reader = csv.reader(file_in) + with codecs.open(csv_report, 'r', encoding=utils.LOCALE) as file_in: + reader = utils.locale_csv_reader(file_in) reader.next() # first line is header, skip it errors = filter( @@ -404,7 +404,7 @@ def print_errors(csv_report, bugs_out): reader ) - with open(bugs_out, 'w') as file_out: + with codecs.open(bugs_out, 'w', encoding=utils.LOCALE) as file_out: text_errors_list = [] for row in errors: filename = row[utils.CSV_INDEX_FILENAME] @@ -419,9 +419,9 @@ def print_errors(csv_report, bugs_out): utils.build_source_context(filename, utils.TERMINAL_FORMATTER, int(line))) - source_context = str(indenter) + source_context = unicode(indenter) text_errors_list.append( - '{0}:{1}: {2}: {3}\n {4}\n{5}'.format( + u'{0}:{1}: {2}: {3}\n {4}\n{5}'.format( filename, line, kind.lower(), @@ -674,7 +674,7 @@ class Infer: def update_stats_with_warnings(self, csv_report): with open(csv_report, 'r') as file_in: - reader = csv.reader(file_in) + reader = utils.locale_csv_reader(file_in) rows = [row for row in reader][1:] for row in rows: key = row[utils.CSV_INDEX_TYPE] @@ -722,7 +722,8 @@ class Infer: # capture and compile mode do not create proc_stats.json if os.path.isfile(proc_stats_path): - with open(proc_stats_path, 'r') as proc_stats_file: + with codecs.open(proc_stats_path, 'r', + encoding=utils.LOCALE) as proc_stats_file: proc_stats = json.load(proc_stats_file) self.stats['int'].update(proc_stats) @@ -741,7 +742,7 @@ class Infer: } stats_path = os.path.join(self.args.infer_out, utils.STATS_FILENAME) - with open(stats_path, 'w') as stats_file: + with codecs.open(stats_path, 'w', encoding=utils.LOCALE) as stats_file: json.dump(self.stats, stats_file, indent=2) diff --git a/infer/lib/python/inferlib/utils.py b/infer/lib/python/inferlib/utils.py index 636e76851..a42481b21 100644 --- a/infer/lib/python/inferlib/utils.py +++ b/infer/lib/python/inferlib/utils.py @@ -11,10 +11,12 @@ from __future__ import print_function from __future__ import unicode_literals import argparse +import codecs import csv import fnmatch import gzip import json +import locale import logging import os import re @@ -30,9 +32,12 @@ import tempfile import time +LOCALE = locale.getpreferredencoding() + # this assumes that this file lives in infer/lib/python/infer/ and the binaries # are in infer/bin/ -INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) +INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__) + .decode(LOCALE)) INFER_INFER_DIRECTORY = os.path.join(INFER_PYTHON_DIRECTORY, os.pardir, os.pardir, os.pardir) INFER_ROOT_DIRECTORY = os.path.join(INFER_INFER_DIRECTORY, os.pardir) @@ -46,7 +51,7 @@ ANNOT_PROCESSOR_JAR = os.path.join(JAVA_LIB_DIRECTORY, 'processor.jar') WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'wrappers') XCODE_WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'xcode_wrappers') -DEFAULT_INFER_OUT = os.path.join(os.getcwd(), 'infer-out') +DEFAULT_INFER_OUT = os.path.join(os.getcwd().decode(LOCALE), 'infer-out') CSV_PERF_FILENAME = 'performances.csv' STATS_FILENAME = 'stats.json' PROC_STATS_FILENAME = 'proc_stats.json' @@ -132,6 +137,14 @@ if "check_output" not in dir(subprocess): subprocess.check_output = f +# csv.reader() doesn't support utf-8. Do not use csv.reader(). Use +# this instead. +def locale_csv_reader(iterable, dialect='excel', **kwargs): + rows = csv.reader(iterable, dialect=dialect, **kwargs) + for row in rows: + yield [unicode(cell, LOCALE) for cell in row] + + def configure_logging(debug, quiet=False): """Configures the default logger. This can be called only once and has to be called before any logging is done. @@ -164,7 +177,7 @@ def get_cmd_in_bin_dir(binary_name): def write_cmd_streams_to_file(logfile, cmd=None, out=None, err=None): - with open(logfile, 'w') as log_filedesc: + with codecs.open(logfile, 'w', encoding=LOCALE) as log_filedesc: if cmd: log_filedesc.write(' '.join(cmd) + '\n') if err is not None: @@ -424,14 +437,19 @@ class Indenter(str): self.text += '\n' def add(self, x): + if type(x) != unicode: + x = x.decode(LOCALE) lines = x.splitlines() indent = self.indent_get() lines = [indent + l for l in lines] self.text += '\n'.join(lines) - def __str__(self): + def __unicode__(self): return self.text + def __str__(self): + return unicode(self).encode(LOCALE) + def syntax_highlighting(source_name, mode, s): if pygments is None or mode == PLAIN_FORMATTER: @@ -454,14 +472,14 @@ def build_source_context(source_name, mode, report_line): n_length = len(str(end_line)) line_number = 1 s = '' - with open(source_name) as source_file: + with codecs.open(source_name, 'r', encoding=LOCALE) as source_file: for line in source_file: if start_line <= line_number <= end_line: num = str(line_number).zfill(n_length) caret = ' ' if line_number == report_line: caret = '> ' - s += num + '. ' + caret + line + s += u'%s. %s%s' % (num, caret, line) line_number += 1 return syntax_highlighting(source_name, mode, s)