From b0370a1c1bb5616a031ab6022682df412604c872 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Thu, 7 Sep 2023 22:05:09 +0800
Subject: [PATCH] Fix a bug that mined duplicate mds; change the list
 traversal approach to avoid a possible exception
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../multi_process_infer_by_pairs.py          | 25 +++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)
 rename multi_process_infer_by_pairs.py => functions/multi_process_infer_by_pairs.py (90%)

diff --git a/multi_process_infer_by_pairs.py b/functions/multi_process_infer_by_pairs.py
similarity index 90%
rename from multi_process_infer_by_pairs.py
rename to functions/multi_process_infer_by_pairs.py
index 911b402..199d4ef 100644
--- a/multi_process_infer_by_pairs.py
+++ b/functions/multi_process_infer_by_pairs.py
@@ -6,13 +6,18 @@ import copy
 
 conf_thresh = 0.8
 
+
 def my_Levenshtein_ratio(str1, str2):
+    if max(len(str1), len(str2)) == 0:
+        return 1
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
 
 
 def if_minimal(md, md_list, target_col):
     # assume this md is minimal
     minimal = True
+    if md_list.count(md) > 1:
+        return False
     for _ in md_list:
         if _ != md:
             # assume every md in the list makes the current md non-minimal
@@ -48,6 +53,7 @@ def remove_by_confidence(md, l, relation, target_col, lock):
 
 def inference_from_record_pairs(path, threshold, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
 
@@ -71,7 +77,8 @@ def inference_from_record_pairs(path, threshold, target_col):
             sims[col] = similarity
 
         # look for violated mds, remove them from the md list and add them to the vio list
-        for md in md_list:
+        # tmp_md_list = copy.deepcopy(md_list)
+        for md in md_list[:]:
             lhs_satis = True
             rhs_satis = True
             for col in list(set(columns) - {target_col}):
@@ -101,33 +108,39 @@ def inference_from_record_pairs(path, threshold, target_col):
                     if if_minimal(spec_l_md, md_list, target_col):
                         md_list.append(spec_l_md)
 
-    for vio in minimal_vio:
+    # tmp_minimal_vio = copy.deepcopy(minimal_vio)
+    for vio in minimal_vio[:]:
         if not if_minimal(vio, md_list, target_col):
             minimal_vio.remove(vio)
 
     manager = multiprocessing.Manager()
     lock = manager.Lock()
     if len(minimal_vio) == 0:
-        return [], []
+        return md_list, []
     pool = multiprocessing.Pool(len(minimal_vio))
-    tmp = copy.deepcopy(minimal_vio)
+    # tmp = copy.deepcopy(minimal_vio)
     with manager:
         proxy_minimal_vio = manager.list(minimal_vio)
-        for _ in tmp:
+        for _ in minimal_vio[:]:
             pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, target_col, lock))
         pool.close()
         pool.join()
         minimal_vio = list(proxy_minimal_vio)
 
-    for _ in tmp:
+    for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
 
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
+
     return md_list, minimal_vio
 
 
 def get_mds_metadata(md_list, dataset_path, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
 
     manager = multiprocessing.Manager()
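
Note on the empty-string guard: once the data is passed through data.fillna("") and astype(str), two compared cells can both be empty, and the old my_Levenshtein_ratio would divide by max(len(str1), len(str2)) == 0. The sketch below is a self-contained version of the patched helper, using the Levenshtein package the module already imports; the example values are illustrative only.

import Levenshtein

def my_Levenshtein_ratio(str1, str2):
    # Guard added by this patch: two empty strings would otherwise raise
    # ZeroDivisionError; they are treated as a perfect match instead.
    if max(len(str1), len(str2)) == 0:
        return 1
    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))

print(my_Levenshtein_ratio("", ""))        # 1
print(my_Levenshtein_ratio("abc", "abd"))  # ~0.667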
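
Note on the duplicate-md fix: if_minimal now returns False for any md that occurs more than once in md_list, so the cleanup loops added at the end of inference_from_record_pairs drop the duplicates. A small sketch, assuming for illustration only that each md is a plain dict of per-column similarity thresholds (the patch itself does not show the actual representation):

# Hypothetical md representation: {column: similarity threshold}.
md_a = {'name': 0.8, 'addr': 0.7}
md_b = {'name': 0.8, 'addr': 0.7}   # equal by value, but a distinct object
md_list = [md_a, md_b, {'name': 0.9, 'addr': 0.7}]

# list.count compares by ==, so value-equal duplicates are counted together.
print(md_list.count(md_a))  # 2 -> if_minimal now returns False for this md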
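
Note on iterating over md_list[:] and minimal_vio[:]: removing elements from a list while iterating over that same list makes the iterator skip the element that slides into the freed slot, which is the kind of anomaly the commit message refers to. Iterating over a shallow slice copy while mutating the original avoids it. A minimal sketch with placeholder values:

items = [1, 1, 2, 3]
for x in items:          # iterating the list that is being mutated
    if x == 1:
        items.remove(x)
print(items)             # [1, 2, 3] -- the second 1 was skipped

items = [1, 1, 2, 3]
for x in items[:]:       # iterate a shallow copy, mutate the original
    if x == 1:
        items.remove(x)
print(items)             # [2, 3] -- all matches removed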