From b0370a1c1bb5616a031ab6022682df412604c872 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Thu, 7 Sep 2023 22:05:09 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86=E6=8C=96?=
 =?UTF-8?q?=E6=8E=98=E5=87=BA=E9=87=8D=E5=A4=8Dmd=E7=9A=84bug=EF=BC=9B=20?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=97=E8=A1=A8=E9=81=8D=E5=8E=86=E6=96=B9?=
 =?UTF-8?q?=E5=BC=8F=EF=BC=8C=E9=81=BF=E5=85=8D=E5=8F=AF=E8=83=BD=E5=87=BA?=
 =?UTF-8?q?=E7=8E=B0=E7=9A=84=E5=BC=82=E5=B8=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../multi_process_infer_by_pairs.py           | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)
 rename multi_process_infer_by_pairs.py => functions/multi_process_infer_by_pairs.py (90%)

diff --git a/multi_process_infer_by_pairs.py b/functions/multi_process_infer_by_pairs.py
similarity index 90%
rename from multi_process_infer_by_pairs.py
rename to functions/multi_process_infer_by_pairs.py
index 911b402..199d4ef 100644
--- a/multi_process_infer_by_pairs.py
+++ b/functions/multi_process_infer_by_pairs.py
@@ -6,13 +6,18 @@ import copy
 
 conf_thresh = 0.8
 
+
 def my_Levenshtein_ratio(str1, str2):
+    if max(len(str1), len(str2)) == 0:
+        return 1
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
 
 
 def if_minimal(md, md_list, target_col):
     # 假设这个md是minimal
     minimal = True
+    if md_list.count(md) > 1:
+        return False
     for _ in md_list:
         if _ != md:
             # 假设列表中每一个md都使当前md不minimal
@@ -48,6 +53,7 @@ def remove_by_confidence(md, l, relation, target_col, lock):
 
 def inference_from_record_pairs(path, threshold, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
 
@@ -71,7 +77,8 @@ def inference_from_record_pairs(path, threshold, target_col):
                 sims[col] = similarity
 
             # 寻找violated md,从md列表中删除并加入vio列表
-            for md in md_list:
+            # Loop over a shallow copy (not a deepcopy): md_list itself is modified while violations are handled.
+            for md in md_list[:]:
                 lhs_satis = True
                 rhs_satis = True
                 for col in list(set(columns) - {target_col}):
@@ -101,33 +108,39 @@ def inference_from_record_pairs(path, threshold, target_col):
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
 
-            for vio in minimal_vio:
+            # Loop over a shallow copy (not a deepcopy): non-minimal entries are removed from minimal_vio in the loop.
+            for vio in minimal_vio[:]:
                 if not if_minimal(vio, md_list, target_col):
                     minimal_vio.remove(vio)
 
     manager = multiprocessing.Manager()
     lock = manager.Lock()
     if len(minimal_vio) == 0:
-        return [], []
+        return md_list, []
     pool = multiprocessing.Pool(len(minimal_vio))
-    tmp = copy.deepcopy(minimal_vio)
+    # No deepcopy of minimal_vio is kept; the loops below iterate over minimal_vio[:] snapshots instead.
     with manager:
         proxy_minimal_vio = manager.list(minimal_vio)
-        for _ in tmp:
+        for _ in minimal_vio[:]:
             pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, target_col, lock))
         pool.close()
         pool.join()
         minimal_vio = list(proxy_minimal_vio)
 
-    for _ in tmp:
+    for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
 
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
+
     return md_list, minimal_vio
 
 
 def get_mds_metadata(md_list, dataset_path, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
 
     manager = multiprocessing.Manager()

From 0902e31e983e0a63f1bb81c1645491d61bb0dfac Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Thu, 7 Sep 2023 22:14:00 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86=E6=8C=96?=
 =?UTF-8?q?=E6=8E=98=E5=87=BA=E9=87=8D=E5=A4=8Dmd=E7=9A=84bug=EF=BC=9B=20?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=97=E8=A1=A8=E9=81=8D=E5=8E=86=E6=96=B9?=
 =?UTF-8?q?=E5=BC=8F=EF=BC=8C=E9=81=BF=E5=85=8D=E5=8F=AF=E8=83=BD=E5=87=BA?=
 =?UTF-8?q?=E7=8E=B0=E7=9A=84=E5=BC=82=E5=B8=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../inference_from_record_pairs.py             |  9 ++++-----
 .../get_support_and_confidence.py              | 18 +++++++++---------
 2 files changed, 13 insertions(+), 14 deletions(-)
 rename inference_from_record_pairs.py => deprecated/inference_from_record_pairs.py (93%)
 rename get_support_and_confidence.py => script/get_support_and_confidence.py (71%)

diff --git a/inference_from_record_pairs.py b/deprecated/inference_from_record_pairs.py
similarity index 93%
rename from inference_from_record_pairs.py
rename to deprecated/inference_from_record_pairs.py
index 5efc81f..4d1b96a 100644
--- a/inference_from_record_pairs.py
+++ b/deprecated/inference_from_record_pairs.py
@@ -3,7 +3,6 @@ import time
 import Levenshtein
 import copy
 
-
 def my_Levenshtein_ratio(str1, str2):
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
 
@@ -131,21 +130,21 @@ def inference_from_record_pairs(path, threshold, target_col):
 
 if __name__ == '__main__':
     # 目前可以仿照这个main函数写
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/DBLP-ACM/output/7.6/TP_single_tuple.csv"
+    path = "input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
     # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8)
     # 例如此处输入参数要求md左侧相似度字段至少为0.7，右侧指向'id'字段
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id_concat')
 
     # 将列表1写入本地，路径需自己修改
-    md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
+    md_path = 'output/md.txt'
     with open(md_path, 'w') as f:
         for _ in mds:
             f.write(str(_) + '\n')
 
     # 将列表2写入本地，路径需自己修改
-    vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
+    vio_path = 'output/vio.txt'
     with open(vio_path, 'w') as f:
         for _ in mds_vio:
             f.write(str(_) + '\n')
diff --git a/get_support_and_confidence.py b/script/get_support_and_confidence.py
similarity index 71%
rename from get_support_and_confidence.py
rename to script/get_support_and_confidence.py
index ecefada..8a16007 100644
--- a/get_support_and_confidence.py
+++ b/script/get_support_and_confidence.py
@@ -1,19 +1,19 @@
 import time
-from multi_process_infer_by_pairs import inference_from_record_pairs
-from multi_process_infer_by_pairs import get_mds_metadata
+from functions.multi_process_infer_by_pairs import inference_from_record_pairs
+from functions.multi_process_infer_by_pairs import get_mds_metadata
 
 if __name__ == '__main__':
     # 目前可以仿照这个main函数写
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/output/8.14/TP_single_tuple.csv"
+    path = "/home/w/PycharmProjects/matching_dependency/input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
     # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8)
     # 例如此处输入参数要求md左侧相似度字段至少为0.7，右侧指向'id'字段
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.1, 'id_concat')
 
     # 如果不需要输出support和confidence，去掉下面两行
-    mds_meta = get_mds_metadata(mds, path, 'id')
-    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id')
+    mds_meta = get_mds_metadata(mds, path, 'id_concat')
+    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id_concat')
 
     # # 若不输出support和confidence，使用以下两块代码
     # # 将列表1写入本地，路径需自己修改
@@ -30,7 +30,7 @@ if __name__ == '__main__':
 
     # 若输出support和confidence，使用以下两块代码
     # 将列表1写入本地，路径需自己修改
-    md_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt'
+    md_path = "output/md.txt"
     with open(md_path, 'w') as f:
         for _ in mds_meta:
             for i in _.keys():
@@ -38,11 +38,11 @@ if __name__ == '__main__':
             f.write('\n')
 
     # 将列表2写入本地，路径需自己修改
-    vio_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt'
+    vio_path = "output/vio.txt"
     with open(vio_path, 'w') as f:
         for _ in mds_vio_meta:
             for i in _.keys():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
 
-    print(time.time() - start)
\ No newline at end of file
+    print(time.time() - start)