From ae81d8d215aa26cb41ebb57ab2b3d0f340d5a0a5 Mon Sep 17 00:00:00 2001
From: Jules Villard <jul@fb.com>
Date: Wed, 11 Nov 2015 07:45:21 -0800
Subject: [PATCH] handle unicode in source code

Summary: public
Should address #175.

Reviewed By: akotulski

Differential Revision: D2641689

fb-gh-sync-id: e98da88
---
 infer/lib/python/BuckAnalyze         |  2 +-
 infer/lib/python/infer               |  7 +++++--
 infer/lib/python/inferTraceBugs      |  9 ++++++---
 infer/lib/python/inferlib/analyze.py | 25 ++++++++++++-----------
 infer/lib/python/inferlib/utils.py   | 30 ++++++++++++++++++++++------
 5 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/infer/lib/python/BuckAnalyze b/infer/lib/python/BuckAnalyze
index 2f67bfda0..fcd20240e 100755
--- a/infer/lib/python/BuckAnalyze
+++ b/infer/lib/python/BuckAnalyze
@@ -338,7 +338,7 @@ def load_stats(opened_jar):
 def load_csv_report(opened_jar):
     try:
         sio = io.StringIO(opened_jar.read(INFER_CSV_REPORT).decode())
-        return list(csv.reader(sio))
+        return list(utils.locale_csv_reader(sio))
     except KeyError as e:
         raise NotFoundInJar
 
diff --git a/infer/lib/python/infer b/infer/lib/python/infer
index c0a03fcc2..97663e06a 100755
--- a/infer/lib/python/infer
+++ b/infer/lib/python/infer
@@ -68,7 +68,9 @@ def load_module(mod_name):
 def split_args_to_parse():
     dd_index = \
         sys.argv.index(CMD_MARKER) if CMD_MARKER in sys.argv else len(sys.argv)
-    return sys.argv[1:dd_index], sys.argv[dd_index + 1:]
+    cmd_raw = sys.argv[dd_index + 1:]
+    return (sys.argv[1:dd_index],
+            [arg.decode(utils.LOCALE) for arg in cmd_raw])
 
 
 def create_argparser(parents=[]):
@@ -168,7 +170,8 @@ def main():
         bugs_filename = os.path.join(args.infer_out,
                                      utils.JSON_REPORT_FILENAME)
         try:
-            with open(bugs_filename) as bugs_file:
+            with codecs.open(bugs_filename, 'r',
+                             encoding=utils.LOCALE) as bugs_file:
                 bugs = json.load(bugs_file)
                 if len(bugs) > 0:
                     sys.exit(analyze.BUG_FOUND_ERROR_CODE)
diff --git a/infer/lib/python/inferTraceBugs b/infer/lib/python/inferTraceBugs
index e6d4e01a9..363046d1f 100755
--- a/infer/lib/python/inferTraceBugs
+++ b/infer/lib/python/inferTraceBugs
@@ -355,13 +355,15 @@ def generate_html_report(args, reports):
     i = 0
     for bug in sel:
         bug_trace_path = path_of_bug_number(traces_dir, i)
-        with open(bug_trace_path, 'w') as bug_trace_file:
+        with codecs.open(bug_trace_path, 'w',
+                         encoding=utils.LOCALE) as bug_trace_file:
             bug_trace_file.write(html_bug_trace(args, bug, i))
         i += 1
 
     remote_source_template = get_remote_source_template()
     bug_list_path = os.path.join(html_dir, 'index.html')
-    with open(bug_list_path, 'w') as bug_list_file:
+    with codecs.open(bug_list_path, 'w',
+                     encoding=utils.LOCALE) as bug_list_file:
         bug_list_file.write(html_list_of_bugs(args,
                                               remote_source_template,
                                               sel))
@@ -373,7 +375,8 @@ def main():
     args = base_parser.parse_args()
 
     report_filename = os.path.join(args.infer_out, utils.JSON_REPORT_FILENAME)
-    with open(report_filename) as report_file:
+    with codecs.open(report_filename, 'r',
+                     encoding=utils.LOCALE) as report_file:
         reports = json.load(report_file)
 
     if args.html:
diff --git a/infer/lib/python/inferlib/analyze.py b/infer/lib/python/inferlib/analyze.py
index 4ff07ca5f..721e8a0d4 100644
--- a/infer/lib/python/inferlib/analyze.py
+++ b/infer/lib/python/inferlib/analyze.py
@@ -11,6 +11,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 import argparse
+import codecs
 import csv
 import glob
 import json
@@ -220,7 +221,6 @@ def create_results_dir(results_dir):
 
 
 def clean(infer_out):
-
     directories = ['multicore', 'classnames', 'sources', jwlib.FILELISTS]
     extensions = ['.cfg', '.cg']
 
@@ -346,7 +346,7 @@ def should_report_json(analyzer, row):
 
 def clean_json(args, json_report):
     collected_rows = []
-    with open(json_report, 'r') as file_in:
+    with codecs.open(json_report, 'r', encoding=utils.LOCALE) as file_in:
         rows = json.load(file_in)
         for row in rows:
             filename = row[utils.JSON_INDEX_FILENAME]
@@ -357,7 +357,7 @@ def clean_json(args, json_report):
             collected_rows,
             cmp=compare_json_rows)
     temporary_file = tempfile.mktemp()
-    with open(temporary_file, 'w') as file_out:
+    with codecs.open(temporary_file, 'w', encoding=utils.LOCALE) as file_out:
         json.dump(collected_rows, file_out)
         file_out.flush()
         shutil.move(temporary_file, json_report)
@@ -366,7 +366,7 @@ def clean_json(args, json_report):
 def clean_csv(args, csv_report):
     collected_rows = []
     with open(csv_report, 'r') as file_in:
-        reader = csv.reader(file_in)
+        reader = utils.locale_csv_reader(file_in)
         rows = [row for row in reader]
         if len(rows) <= 1:
             return rows
@@ -395,8 +395,8 @@ def print_and_write(file_out, message):
 
 
 def print_errors(csv_report, bugs_out):
-    with open(csv_report, 'r') as file_in:
-        reader = csv.reader(file_in)
+    with codecs.open(csv_report, 'r', encoding=utils.LOCALE) as file_in:
+        reader = utils.locale_csv_reader(file_in)
         reader.next()  # first line is header, skip it
 
         errors = filter(
@@ -404,7 +404,7 @@ def print_errors(csv_report, bugs_out):
             reader
         )
 
-        with open(bugs_out, 'w') as file_out:
+        with codecs.open(bugs_out, 'w', encoding=utils.LOCALE) as file_out:
             text_errors_list = []
             for row in errors:
                 filename = row[utils.CSV_INDEX_FILENAME]
@@ -419,9 +419,9 @@ def print_errors(csv_report, bugs_out):
                         utils.build_source_context(filename,
                                                    utils.TERMINAL_FORMATTER,
                                                    int(line)))
-                    source_context = str(indenter)
+                    source_context = unicode(indenter)
                     text_errors_list.append(
-                        '{0}:{1}: {2}: {3}\n  {4}\n{5}'.format(
+                        u'{0}:{1}: {2}: {3}\n  {4}\n{5}'.format(
                             filename,
                             line,
                             kind.lower(),
@@ -674,7 +674,7 @@ class Infer:
 
     def update_stats_with_warnings(self, csv_report):
         with open(csv_report, 'r') as file_in:
-            reader = csv.reader(file_in)
+            reader = utils.locale_csv_reader(file_in)
             rows = [row for row in reader][1:]
             for row in rows:
                 key = row[utils.CSV_INDEX_TYPE]
@@ -722,7 +722,8 @@ class Infer:
 
         # capture and compile mode do not create proc_stats.json
         if os.path.isfile(proc_stats_path):
-            with open(proc_stats_path, 'r') as proc_stats_file:
+            with codecs.open(proc_stats_path, 'r',
+                             encoding=utils.LOCALE) as proc_stats_file:
                 proc_stats = json.load(proc_stats_file)
                 self.stats['int'].update(proc_stats)
 
@@ -741,7 +742,7 @@ class Infer:
         }
 
         stats_path = os.path.join(self.args.infer_out, utils.STATS_FILENAME)
-        with open(stats_path, 'w') as stats_file:
+        with codecs.open(stats_path, 'w', encoding=utils.LOCALE) as stats_file:
             json.dump(self.stats, stats_file, indent=2)
 
 
diff --git a/infer/lib/python/inferlib/utils.py b/infer/lib/python/inferlib/utils.py
index 636e76851..a42481b21 100644
--- a/infer/lib/python/inferlib/utils.py
+++ b/infer/lib/python/inferlib/utils.py
@@ -11,10 +11,12 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 import argparse
+import codecs
 import csv
 import fnmatch
 import gzip
 import json
+import locale
 import logging
 import os
 import re
@@ -30,9 +32,12 @@ import tempfile
 import time
 
 
+LOCALE = locale.getpreferredencoding()
+
 # this assumes that this file lives in infer/lib/python/infer/ and the binaries
 # are in infer/bin/
-INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
+INFER_PYTHON_DIRECTORY = os.path.dirname(os.path.realpath(__file__)
+                                         .decode(LOCALE))
 INFER_INFER_DIRECTORY = os.path.join(INFER_PYTHON_DIRECTORY,
                                      os.pardir, os.pardir, os.pardir)
 INFER_ROOT_DIRECTORY = os.path.join(INFER_INFER_DIRECTORY, os.pardir)
@@ -46,7 +51,7 @@ ANNOT_PROCESSOR_JAR = os.path.join(JAVA_LIB_DIRECTORY, 'processor.jar')
 WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'wrappers')
 XCODE_WRAPPERS_DIRECTORY = os.path.join(LIB_DIRECTORY, 'xcode_wrappers')
 
-DEFAULT_INFER_OUT = os.path.join(os.getcwd(), 'infer-out')
+DEFAULT_INFER_OUT = os.path.join(os.getcwd().decode(LOCALE), 'infer-out')
 CSV_PERF_FILENAME = 'performances.csv'
 STATS_FILENAME = 'stats.json'
 PROC_STATS_FILENAME = 'proc_stats.json'
@@ -132,6 +137,14 @@ if "check_output" not in dir(subprocess):
     subprocess.check_output = f
 
 
+# csv.reader() does not support unicode. Do not use csv.reader() directly. Use
+# this instead on a byte-string input: it decodes each cell using LOCALE.
+def locale_csv_reader(iterable, dialect='excel', **kwargs):
+    rows = csv.reader(iterable, dialect=dialect, **kwargs)
+    for row in rows:
+        yield [unicode(cell, LOCALE) for cell in row]
+
+
 def configure_logging(debug, quiet=False):
     """Configures the default logger. This can be called only once and has to
     be called before any logging is done.
@@ -164,7 +177,7 @@ def get_cmd_in_bin_dir(binary_name):
 
 
 def write_cmd_streams_to_file(logfile, cmd=None, out=None, err=None):
-    with open(logfile, 'w') as log_filedesc:
+    with codecs.open(logfile, 'w', encoding=LOCALE) as log_filedesc:
         if cmd:
             log_filedesc.write(' '.join(cmd) + '\n')
         if err is not None:
@@ -424,14 +437,19 @@ class Indenter(str):
         self.text += '\n'
 
     def add(self, x):
+        if type(x) != unicode:
+            x = x.decode(LOCALE)
         lines = x.splitlines()
         indent = self.indent_get()
         lines = [indent + l for l in lines]
         self.text += '\n'.join(lines)
 
-    def __str__(self):
+    def __unicode__(self):
         return self.text
 
+    def __str__(self):
+        return unicode(self).encode(LOCALE)
+
 
 def syntax_highlighting(source_name, mode, s):
     if pygments is None or mode == PLAIN_FORMATTER:
@@ -454,14 +472,14 @@ def build_source_context(source_name, mode, report_line):
     n_length = len(str(end_line))
     line_number = 1
     s = ''
-    with open(source_name) as source_file:
+    with codecs.open(source_name, 'r', encoding=LOCALE) as source_file:
         for line in source_file:
             if start_line <= line_number <= end_line:
                 num = str(line_number).zfill(n_length)
                 caret = '  '
                 if line_number == report_line:
                     caret = '> '
-                s += num + '. ' + caret + line
+                s += u'%s. %s%s' % (num, caret, line)
             line_number += 1
     return syntax_highlighting(source_name, mode, s)