diff --git a/inference_from_record_pairs.py b/deprecated/inference_from_record_pairs.py
similarity index 93%
rename from inference_from_record_pairs.py
rename to deprecated/inference_from_record_pairs.py
index 5efc81f..4d1b96a 100644
--- a/inference_from_record_pairs.py
+++ b/deprecated/inference_from_record_pairs.py
@@ -3,7 +3,6 @@
 import time
 import Levenshtein
 import copy
 
-
 def my_Levenshtein_ratio(str1, str2):
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
@@ -131,21 +130,21 @@ def inference_from_record_pairs(path, threshold, target_col):
 
 if __name__ == '__main__':
     # For now, new entry points can be modeled on this main function
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/DBLP-ACM/output/7.6/TP_single_tuple.csv"
+    path = "input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # Input: CSV file path, similarity threshold for the MD's LHS, target column for the MD's RHS
     # Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # e.g. these arguments require every LHS similarity to be at least 0.7, with the RHS targeting the 'id' column
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id_concat')
 
     # Write list 1 to disk; adjust the path as needed
-    md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
+    md_path = 'output/md.txt'
     with open(md_path, 'w') as f:
         for _ in mds:
             f.write(str(_) + '\n')
 
     # Write list 2 to disk; adjust the path as needed
-    vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
+    vio_path = 'output/vio.txt'
     with open(vio_path, 'w') as f:
         for _ in mds_vio:
             f.write(str(_) + '\n')
diff --git a/multi_process_infer_by_pairs.py b/functions/multi_process_infer_by_pairs.py
similarity index 90%
rename from multi_process_infer_by_pairs.py
rename to functions/multi_process_infer_by_pairs.py
index 911b402..199d4ef 100644
--- a/multi_process_infer_by_pairs.py
+++ b/functions/multi_process_infer_by_pairs.py
@@ -6,13 +6,18 @@ import copy
 
 conf_thresh = 0.8
 
+
 def my_Levenshtein_ratio(str1, str2):
+    if max(len(str1), len(str2)) == 0:
+        return 1
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
 
 
 def if_minimal(md, md_list, target_col):
     # Assume this MD is minimal
     minimal = True
+    if md_list.count(md) > 1:
+        return False
     for _ in md_list:
         if _ != md:
             # Check whether each other MD in the list makes the current MD non-minimal
@@ -48,6 +53,7 @@ def remove_by_confidence(md, l, relation, target_col, lock):
 
 
 def inference_from_record_pairs(path, threshold, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
@@ -71,7 +77,8 @@ def inference_from_record_pairs(path, threshold, target_col):
             sims[col] = similarity
 
         # Find violated MDs, remove them from the MD list, and add them to the violation list
-        for md in md_list:
+        # tmp_md_list = copy.deepcopy(md_list)
+        for md in md_list[:]:
             lhs_satis = True
             rhs_satis = True
             for col in list(set(columns) - {target_col}):
@@ -101,33 +108,39 @@ def inference_from_record_pairs(path, threshold, target_col):
                     if if_minimal(spec_l_md, md_list, target_col):
                         md_list.append(spec_l_md)
 
-        for vio in minimal_vio:
+        # tmp_minimal_vio = copy.deepcopy(minimal_vio)
+        for vio in minimal_vio[:]:
             if not if_minimal(vio, md_list, target_col):
                 minimal_vio.remove(vio)
 
     manager = multiprocessing.Manager()
     lock = manager.Lock()
     if len(minimal_vio) == 0:
-        return [], []
+        return md_list, []
     pool = multiprocessing.Pool(len(minimal_vio))
-    tmp = copy.deepcopy(minimal_vio)
+    # tmp = copy.deepcopy(minimal_vio)
     with manager:
         proxy_minimal_vio = manager.list(minimal_vio)
-        for _ in tmp:
+        for _ in minimal_vio[:]:
             pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, target_col, lock))
         pool.close()
         pool.join()
         minimal_vio = list(proxy_minimal_vio)
 
-    for _ in tmp:
+    for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
 
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
+
     return md_list, minimal_vio
 
 
 def get_mds_metadata(md_list, dataset_path, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
 
     manager = multiprocessing.Manager()
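Two defensive patterns recur throughout the hunks above: a zero-length guard in my_Levenshtein_ratio (needed because data.fillna("") can now hand the comparator two empty strings, which previously divided by zero), and iteration over a slice copy (lst[:]) wherever the loop body removes elements from the list it is walking. A minimal self-contained sketch of both, for reference; the numeric md_list below is an illustrative stand-in, not the real MD structure:

    import Levenshtein

    def my_Levenshtein_ratio(str1, str2):
        # Guard against two empty strings, which would otherwise divide by zero.
        if max(len(str1), len(str2)) == 0:
            return 1
        return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))

    print(my_Levenshtein_ratio("", ""))               # 1 instead of ZeroDivisionError
    print(my_Levenshtein_ratio("kitten", "sitting"))  # ~0.571

    # Removing items while iterating over the same list skips elements;
    # iterating over a slice copy makes removal from the original safe.
    md_list = [0.5, 0.6, 0.9]
    for md in md_list[:]:
        if md < 0.9:
            md_list.remove(md)
    print(md_list)  # [0.9]
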
diff --git a/get_support_and_confidence.py b/script/get_support_and_confidence.py
similarity index 71%
rename from get_support_and_confidence.py
rename to script/get_support_and_confidence.py
index ecefada..8a16007 100644
--- a/get_support_and_confidence.py
+++ b/script/get_support_and_confidence.py
@@ -1,19 +1,19 @@
 import time
-from multi_process_infer_by_pairs import inference_from_record_pairs
-from multi_process_infer_by_pairs import get_mds_metadata
+from functions.multi_process_infer_by_pairs import inference_from_record_pairs
+from functions.multi_process_infer_by_pairs import get_mds_metadata
 
 if __name__ == '__main__':
     # For now, new entry points can be modeled on this main function
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/output/8.14/TP_single_tuple.csv"
+    path = "/home/w/PycharmProjects/matching_dependency/input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # Input: CSV file path, similarity threshold for the MD's LHS, target column for the MD's RHS
     # Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # e.g. these arguments require every LHS similarity to be at least 0.7, with the RHS targeting the 'id' column
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.1, 'id_concat')
 
     # If support and confidence are not needed, remove the next two lines
-    mds_meta = get_mds_metadata(mds, path, 'id')
-    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id')
+    mds_meta = get_mds_metadata(mds, path, 'id_concat')
+    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id_concat')
 
     # # If support and confidence are not written out, use the following two blocks
     # # Write list 1 to disk; adjust the path as needed
@@ -30,7 +30,7 @@ if __name__ == '__main__':
 
     # To write out support and confidence, use the following two blocks
     # Write list 1 to disk; adjust the path as needed
-    md_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt'
+    md_path = "output/md.txt"
     with open(md_path, 'w') as f:
         for _ in mds_meta:
             for i in _.keys():
@@ -38,11 +38,11 @@ if __name__ == '__main__':
             f.write('\n')
 
     # Write list 2 to disk; adjust the path as needed
-    vio_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt'
+    vio_path = "output/vio.txt"
     with open(vio_path, 'w') as f:
         for _ in mds_vio_meta:
             for i in _.keys():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
-    print(time.time() - start)
\ No newline at end of file
+    print(time.time() - start)
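One behavioral change in this diff is easy to miss: when no MD is violated, inference_from_record_pairs used to return ([], []), discarding everything it had inferred, and now returns (md_list, []). A usage sketch under the new layout (an assumption, not part of the diff: the repository root is the working directory and is on sys.path, so the package-qualified import and the relative input/ and output/ paths resolve):

    from functions.multi_process_infer_by_pairs import inference_from_record_pairs, get_mds_metadata

    path = 'input/T_positive_with_id_concat_single_tuple.csv'
    # LHS similarities must reach 0.7; the RHS targets the 'id_concat' column.
    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id_concat')
    # Optional: compute support and confidence for each inferred MD.
    mds_meta = get_mds_metadata(mds, path, 'id_concat')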