From 37d2e84192bee2bd5e08de522066cd7702d46e98 Mon Sep 17 00:00:00 2001
From: Jules Villard <jul@fb.com>
Date: Wed, 25 Nov 2015 07:39:32 -0800
Subject: [PATCH] dedup merged json reports

Summary: public
When merging json reports from different buck targets, the same bug may be
reported several times.

Clean up some bug sorting functions while I'm at it.

Reviewed By: martinoluca

Differential Revision: D2690665

fb-gh-sync-id: 4a12072
---
 infer/lib/python/inferlib/capture/buck.py |  4 +-
 infer/lib/python/inferlib/issues.py       | 45 ++++++++++-------------
 infer/lib/python/inferlib/utils.py        |  7 ----
 3 files changed, 22 insertions(+), 34 deletions(-)

diff --git a/infer/lib/python/inferlib/capture/buck.py b/infer/lib/python/inferlib/capture/buck.py
index d0abcbfef..e711fe3bf 100644
--- a/infer/lib/python/inferlib/capture/buck.py
+++ b/infer/lib/python/inferlib/capture/buck.py
@@ -13,7 +13,7 @@ import subprocess
 import traceback
 import util
 
-from inferlib import config, utils
+from inferlib import config, issues, utils
 
 MODULE_NAME = __name__
 MODULE_DESCRIPTION = '''Run analysis of code built with a command like:
@@ -136,7 +136,7 @@ class BuckAnalyzer:
         if not ret == os.EX_OK:
             return ret
         result_files = self._get_analysis_result_files()
-        all_results = utils.merge_json_arrays_from_files(result_files)
+        all_results = issues.merge_reports_from_paths(result_files)
         merged_results_path = os.path.join(self.args.infer_out,
                                            config.JSON_REPORT_FILENAME)
         utils.dump_json_to_path(all_results, merged_results_path)
diff --git a/infer/lib/python/inferlib/issues.py b/infer/lib/python/inferlib/issues.py
index 91f051861..ef90d1544 100644
--- a/infer/lib/python/inferlib/issues.py
+++ b/infer/lib/python/inferlib/issues.py
@@ -12,7 +12,9 @@ from __future__ import unicode_literals
 
 import codecs
 import csv
+import itertools
 import json
+import operator
 import os
 import shutil
 import sys
@@ -108,9 +110,8 @@ def clean_csv(args, csv_report):
                     if args.no_filtering \
                        or _should_report_csv(args.analyzer, row):
                         collected_rows.append(row)
-            collected_rows = sorted(
-                collected_rows,
-                cmp=_compare_csv_rows)
+            collected_rows.sort(key=operator.itemgetter(CSV_INDEX_FILENAME,
+                                                        CSV_INDEX_LINE))
             collected_rows = [rows[0]] + collected_rows
     temporary_file = tempfile.mktemp()
     with open(temporary_file, 'w') as file_out:
@@ -130,7 +131,8 @@ def clean_json(args, json_report):
                  _should_report_json(args.analyzer, row)))
 
     rows = filter(is_clean, rows)
-    rows.sort(cmp=_compare_json_rows)
+    rows.sort(key=operator.itemgetter(JSON_INDEX_FILENAME,
+                                      JSON_INDEX_LINE))
     temporary_file = tempfile.mktemp()
     utils.dump_json_to_path(rows, temporary_file)
     shutil.move(temporary_file, json_report)
@@ -193,29 +195,22 @@ def print_and_save_errors(json_report, bugs_out):
         file_out.write(text)
 
 
-def _compare_issues(filename_1, line_1, filename_2, line_2):
-    if filename_1 < filename_2:
-        return -1
-    elif filename_1 > filename_2:
-        return 1
-    else:
-        return line_1 - line_2
-
-
-def _compare_csv_rows(row_1, row_2):
-    filename_1 = row_1[CSV_INDEX_FILENAME]
-    filename_2 = row_2[CSV_INDEX_FILENAME]
-    line_1 = int(row_1[CSV_INDEX_LINE])
-    line_2 = int(row_2[CSV_INDEX_LINE])
-    return _compare_issues(filename_1, line_1, filename_2, line_2)
+def merge_reports_from_paths(report_paths):
+    json_data = []
+    for json_path in report_paths:
+        json_data.extend(utils.load_json_from_path(json_path))
+    return _sort_and_uniq_rows(json_data)
 
 
-def _compare_json_rows(row_1, row_2):
-    filename_1 = row_1[JSON_INDEX_FILENAME]
-    filename_2 = row_2[JSON_INDEX_FILENAME]
-    line_1 = row_1[JSON_INDEX_LINE]
-    line_2 = row_2[JSON_INDEX_LINE]
-    return _compare_issues(filename_1, line_1, filename_2, line_2)
+def _sort_and_uniq_rows(l):
+    key = operator.itemgetter(JSON_INDEX_FILENAME,
+                              JSON_INDEX_LINE,
+                              JSON_INDEX_HASH,
+                              JSON_INDEX_QUALIFIER)
+    l.sort(key=key)
+    groups = itertools.groupby(l, key)
+    # guaranteed to be at least one element in each group
+    return map(lambda (keys, dups): dups.next(), groups)
 
 
 def _should_report(analyzer, error_kind, error_type, error_bucket):
diff --git a/infer/lib/python/inferlib/utils.py b/infer/lib/python/inferlib/utils.py
index cc9ee4f72..923811b0b 100644
--- a/infer/lib/python/inferlib/utils.py
+++ b/infer/lib/python/inferlib/utils.py
@@ -147,13 +147,6 @@ def dump_json_to_path(
                   indent, separators, encoding, default, sort_keys, **kw)
 
 
-def merge_json_arrays_from_files(report_paths):
-    json_data = []
-    for json_path in report_paths:
-        json_data.extend(load_json_from_path(json_path))
-    return json_data
-
-
 def infer_version():
     version = json.loads(subprocess.check_output([
         get_cmd_in_bin_dir('InferAnalyze'),