handle unicode in source code

Summary: public
Should address #175.

Reviewed By: akotulski

Differential Revision: D2641689

fb-gh-sync-id: e98da88
master
Jules Villard 9 years ago committed by facebook-github-bot-1
parent 0cd533f892
commit ae81d8d215

@ -338,7 +338,7 @@ def load_stats(opened_jar):
def load_csv_report(opened_jar):
    """Read the Infer CSV report out of *opened_jar* and parse it.

    The raw bytes are decoded first, then parsed with the locale-aware
    reader so non-ASCII characters in source paths/messages survive
    (plain csv.reader() does not handle utf-8 on Python 2).

    Raises NotFoundInJar if the jar has no CSV report entry.
    """
    try:
        sio = io.StringIO(opened_jar.read(INFER_CSV_REPORT).decode())
        # NOTE: use the locale-aware reader, not csv.reader() directly;
        # the stale csv.reader() call (a leftover of the old code) has
        # been dropped — it shadowed this line and reverted the fix.
        return list(utils.locale_csv_reader(sio))
    except KeyError as e:
        # zipfile raises KeyError for a missing archive member.
        raise NotFoundInJar

@ -68,7 +68,9 @@ def load_module(mod_name):
def split_args_to_parse():
    """Split sys.argv around CMD_MARKER.

    Returns a pair: (arguments for infer itself, the wrapped command
    after the marker with each argument decoded to unicode using the
    preferred locale).  If the marker is absent the second list is empty.
    """
    dd_index = \
        sys.argv.index(CMD_MARKER) if CMD_MARKER in sys.argv else len(sys.argv)
    # The stale pre-fix `return sys.argv[1:dd_index], sys.argv[dd_index + 1:]`
    # made the decoding below unreachable; it has been removed.
    cmd_raw = sys.argv[dd_index + 1:]
    return (sys.argv[1:dd_index],
            [arg.decode(utils.LOCALE) for arg in cmd_raw])
def create_argparser(parents=[]):
@ -168,7 +170,8 @@ def main():
bugs_filename = os.path.join(args.infer_out,
utils.JSON_REPORT_FILENAME)
try:
with open(bugs_filename) as bugs_file:
with codecs.open(bugs_filename, 'r',
encoding=utils.LOCALE) as bugs_file:
bugs = json.load(bugs_file)
if len(bugs) > 0:
sys.exit(analyze.BUG_FOUND_ERROR_CODE)

@ -355,13 +355,15 @@ def generate_html_report(args, reports):
i = 0
for bug in sel:
bug_trace_path = path_of_bug_number(traces_dir, i)
with open(bug_trace_path, 'w') as bug_trace_file:
with codecs.open(bug_trace_path, 'w',
encoding=utils.LOCALE) as bug_trace_file:
bug_trace_file.write(html_bug_trace(args, bug, i))
i += 1
remote_source_template = get_remote_source_template()
bug_list_path = os.path.join(html_dir, 'index.html')
with open(bug_list_path, 'w') as bug_list_file:
with codecs.open(bug_list_path, 'w',
encoding=utils.LOCALE) as bug_list_file:
bug_list_file.write(html_list_of_bugs(args,
remote_source_template,
sel))
@ -373,7 +375,8 @@ def main():
args = base_parser.parse_args()
report_filename = os.path.join(args.infer_out, utils.JSON_REPORT_FILENAME)
with open(report_filename) as report_file:
with codecs.open(report_filename, 'r',
encoding=utils.LOCALE) as report_file:
reports = json.load(report_file)
if args.html:

@ -11,6 +11,7 @@ from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import csv
import glob
import json
@ -220,7 +221,6 @@ def create_results_dir(results_dir):
def clean(infer_out):
directories = ['multicore', 'classnames', 'sources', jwlib.FILELISTS]
extensions = ['.cfg', '.cg']
@ -346,7 +346,7 @@ def should_report_json(analyzer, row):
def clean_json(args, json_report):
collected_rows = []
with open(json_report, 'r') as file_in:
with codecs.open(json_report, 'r', encoding=utils.LOCALE) as file_in:
rows = json.load(file_in)
for row in rows:
filename = row[utils.JSON_INDEX_FILENAME]
@ -357,7 +357,7 @@ def clean_json(args, json_report):
collected_rows,
cmp=compare_json_rows)
temporary_file = tempfile.mktemp()
with open(temporary_file, 'w') as file_out:
with codecs.open(temporary_file, 'w', encoding=utils.LOCALE) as file_out:
json.dump(collected_rows, file_out)
file_out.flush()
shutil.move(temporary_file, json_report)
@ -366,7 +366,7 @@ def clean_json(args, json_report):
def clean_csv(args, csv_report):
collected_rows = []
with open(csv_report, 'r') as file_in:
reader = csv.reader(file_in)
reader = utils.locale_csv_reader(file_in)
rows = [row for row in reader]
if len(rows) <= 1:
return rows
@ -395,8 +395,8 @@ def print_and_write(file_out, message):
def print_errors(csv_report, bugs_out):
with open(csv_report, 'r') as file_in:
reader = csv.reader(file_in)
with codecs.open(csv_report, 'r', encoding=utils.LOCALE) as file_in:
reader = utils.locale_csv_reader(file_in)
reader.next() # first line is header, skip it
errors = filter(
@ -404,7 +404,7 @@ def print_errors(csv_report, bugs_out):
reader
)
with open(bugs_out, 'w') as file_out:
with codecs.open(bugs_out, 'w', encoding=utils.LOCALE) as file_out:
text_errors_list = []
for row in errors:
filename = row[utils.CSV_INDEX_FILENAME]
@ -419,9 +419,9 @@ def print_errors(csv_report, bugs_out):
utils.build_source_context(filename,
utils.TERMINAL_FORMATTER,
int(line)))
source_context = str(indenter)
source_context = unicode(indenter)
text_errors_list.append(
'{0}:{1}: {2}: {3}\n {4}\n{5}'.format(
u'{0}:{1}: {2}: {3}\n {4}\n{5}'.format(
filename,
line,
kind.lower(),
@ -674,7 +674,7 @@ class Infer:
def update_stats_with_warnings(self, csv_report):
with open(csv_report, 'r') as file_in:
reader = csv.reader(file_in)
reader = utils.locale_csv_reader(file_in)
rows = [row for row in reader][1:]
for row in rows:
key = row[utils.CSV_INDEX_TYPE]
@ -722,7 +722,8 @@ class Infer:
# capture and compile mode do not create proc_stats.json
if os.path.isfile(proc_stats_path):
with open(proc_stats_path, 'r') as proc_stats_file:
with codecs.open(proc_stats_path, 'r',
encoding=utils.LOCALE) as proc_stats_file:
proc_stats = json.load(proc_stats_file)
self.stats['int'].update(proc_stats)
@ -741,7 +742,7 @@ class Infer:
}
stats_path = os.path.join(self.args.infer_out, utils.STATS_FILENAME)
with open(stats_path, 'w') as stats_file:
with codecs.open(stats_path, 'w', encoding=utils.LOCALE) as stats_file:
json.dump(self.stats, stats_file, indent=2)

@ -11,10 +11,12 @@ from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import csv
import fnmatch
import gzip
import json
import locale
import logging
import os
import re
@ -30,9 +32,12 @@ import tempfile
import time
LOCALE = locale.getpreferredencoding()
# this assumes that this file lives in infer/lib/python/infer/ and the binaries
# are in infer/bin/
INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__)
.decode(LOCALE))
INFER_INFER_DIRECTORY = os.path.join(INFER_PYTHON_DIRECTORY,
os.pardir, os.pardir, os.pardir)
INFER_ROOT_DIRECTORY = os.path.join(INFER_INFER_DIRECTORY, os.pardir)
@ -46,7 +51,7 @@ ANNOT_PROCESSOR_JAR = os.path.join(JAVA_LIB_DIRECTORY, 'processor.jar')
WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'wrappers')
XCODE_WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'xcode_wrappers')
DEFAULT_INFER_OUT = os.path.join(os.getcwd(), 'infer-out')
DEFAULT_INFER_OUT = os.path.join(os.getcwd().decode(LOCALE), 'infer-out')
CSV_PERF_FILENAME = 'performances.csv'
STATS_FILENAME = 'stats.json'
PROC_STATS_FILENAME = 'proc_stats.json'
@ -132,6 +137,14 @@ if "check_output" not in dir(subprocess):
subprocess.check_output = f
# csv.reader() doesn't support utf-8. Do not use csv.reader(). Use
# this instead.
def locale_csv_reader(iterable, dialect='excel', **kwargs):
    """Generator wrapping csv.reader() that decodes every cell.

    Python 2's csv module yields byte strings and cannot cope with
    multi-byte encodings, hence the module-level warning to use this
    instead of csv.reader() directly.  Each row is re-emitted with all
    cells decoded to unicode using the preferred locale encoding.
    """
    rows = csv.reader(iterable, dialect=dialect, **kwargs)
    for row in rows:
        # NOTE(review): Python-2-only — the `unicode` builtin does not
        # exist on Python 3, where csv.reader already yields str.
        yield [unicode(cell, LOCALE) for cell in row]
def configure_logging(debug, quiet=False):
"""Configures the default logger. This can be called only once and has to
be called before any logging is done.
@ -164,7 +177,7 @@ def get_cmd_in_bin_dir(binary_name):
def write_cmd_streams_to_file(logfile, cmd=None, out=None, err=None):
with open(logfile, 'w') as log_filedesc:
with codecs.open(logfile, 'w', encoding=LOCALE) as log_filedesc:
if cmd:
log_filedesc.write(' '.join(cmd) + '\n')
if err is not None:
@ -424,14 +437,19 @@ class Indenter(str):
self.text += '\n'
    def add(self, x):
        """Append *x* to the buffer, indenting every line it contains.

        Accepts bytes or unicode; byte strings are decoded with the
        preferred locale first (Python-2-only: `unicode` / `.decode`).
        Lines are joined with '\n' and prefixed with the current indent.
        """
        if type(x) != unicode:
            x = x.decode(LOCALE)
        lines = x.splitlines()
        # indent_get() yields the prefix for the current nesting level.
        indent = self.indent_get()
        lines = [indent + l for l in lines]
        self.text += '\n'.join(lines)
def __str__(self):
    def __unicode__(self):
        # Unicode view of the accumulated, indented text (py2 protocol).
        return self.text
    def __str__(self):
        # Python-2-only bridge: byte-string view, encoded with the
        # preferred locale so printing non-ASCII text does not crash.
        return unicode(self).encode(LOCALE)
def syntax_highlighting(source_name, mode, s):
if pygments is None or mode == PLAIN_FORMATTER:
@ -454,14 +472,14 @@ def build_source_context(source_name, mode, report_line):
n_length = len(str(end_line))
line_number = 1
s = ''
with open(source_name) as source_file:
with codecs.open(source_name, 'r', encoding=LOCALE) as source_file:
for line in source_file:
if start_line <= line_number <= end_line:
num = str(line_number).zfill(n_length)
caret = ' '
if line_number == report_line:
caret = '> '
s += num + '. ' + caret + line
s += u'%s. %s%s' % (num, caret, line)
line_number += 1
return syntax_highlighting(source_name, mode, s)

Loading…
Cancel
Save