diff --git a/get_support_and_confidence.py b/get_support_and_confidence.py new file mode 100644 index 0000000..ecefada --- /dev/null +++ b/get_support_and_confidence.py @@ -0,0 +1,48 @@ +import time +from multi_process_infer_by_pairs import inference_from_record_pairs +from multi_process_infer_by_pairs import get_mds_metadata + +if __name__ == '__main__': + # 目前可以仿照这个main函数写 + path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/output/8.14/TP_single_tuple.csv" + start = time.time() + # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 + # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8) + # 例如此处输入参数要求md左侧相似度字段至少为0.7,右侧指向'id'字段 + mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id') + + # 如果不需要输出support和confidence,去掉下面两行 + mds_meta = get_mds_metadata(mds, path, 'id') + mds_vio_meta = get_mds_metadata(mds_vio, path, 'id') + + # # 若不输出support和confidence,使用以下两块代码 + # # 将列表1写入本地,路径需自己修改 + # md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt' + # with open(md_path, 'w') as f: + # for _ in mds: + # f.write(str(_) + '\n') + # + # # 将列表2写入本地,路径需自己修改 + # vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt' + # with open(vio_path, 'w') as f: + # for _ in mds_vio: + # f.write(str(_) + '\n') + + # 若输出support和confidence,使用以下两块代码 + # 将列表1写入本地,路径需自己修改 + md_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt' + with open(md_path, 'w') as f: + for _ in mds_meta: + for i in _.keys(): + f.write(i + ':' + str(_[i]) + '\t') + f.write('\n') + + # 将列表2写入本地,路径需自己修改 + vio_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt' + with open(vio_path, 'w') as f: + for _ in mds_vio_meta: + for i in _.keys(): + f.write(i + ':' + str(_[i]) + '\t') + f.write('\n') + + print(time.time() - start) \ No newline at end of file diff --git a/inference_from_record_pairs.py b/inference_from_record_pairs.py index a2bd590..5efc81f 100644 --- a/inference_from_record_pairs.py +++ b/inference_from_record_pairs.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd import time import Levenshtein @@ -17,7 +16,7 @@ def if_minimal(md, md_list, target_col): # 假设列表中每一个md都使当前md不minimal exist = True # 如果左边任何一个大于,则假设不成立 - for col in list(set(_.keys()) - set([target_col])): + for col in list(set(_.keys()) - {target_col}): if _[col] > md[col]: exist = False # 如果右边小于,假设也不成立 @@ -52,8 +51,10 @@ def satisfy_confidence(md, df, conf_thresh, target_col): support += 1 if both_satisfy: support_plus += 1 + if support == 0: + return False, 0.0 confidence = support_plus / support - return confidence >= conf_thresh + return confidence >= conf_thresh, confidence def inference_from_record_pairs(path, threshold, target_col): @@ -84,7 +85,7 @@ def inference_from_record_pairs(path, threshold, target_col): for md in md_list: lhs_satis = True rhs_satis = True - for col in list(set(columns) - set([target_col])): + for col in list(set(columns) - {target_col}): if sims[col] < md[col]: lhs_satis = False if sims[target_col] < md[target_col]: @@ -104,7 +105,7 @@ def inference_from_record_pairs(path, threshold, target_col): # md_list.append(spec_r_md) # 特殊化左侧 - for col in list(set(columns) - set([target_col])): + for col in list(set(columns) - {target_col}): if sims[col] + 0.001 <= 1: spec_l_md = copy.deepcopy(vio_md) spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.001 @@ -115,12 +116,13 @@ def inference_from_record_pairs(path, threshold, target_col): if not if_minimal(vio, md_list, target_col): minimal_vio.remove(vio) - for _ in minimal_vio: - if not satisfy_confidence(_, data, 0.8, target_col): + tmp = copy.deepcopy(minimal_vio) + for _ in tmp: + satis, conf = satisfy_confidence(_, data, 0.8, target_col) + if not satis: minimal_vio.remove(_) - list1 = copy.deepcopy(minimal_vio) - for _ in list1: + for _ in tmp: if not if_minimal(_, minimal_vio, target_col): minimal_vio.remove(_) @@ -129,7 +131,7 @@ def inference_from_record_pairs(path, threshold, target_col): if __name__ == '__main__': # 目前可以仿照这个main函数写 - path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/output/8.14/TP_single_tuple.csv" + path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/DBLP-ACM/output/7.6/TP_single_tuple.csv" start = time.time() # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8) @@ -137,15 +139,15 @@ if __name__ == '__main__': mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id') # 将列表1写入本地,路径需自己修改 - md_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt' + md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt' with open(md_path, 'w') as f: for _ in mds: - f.write(str(_)+'\n') + f.write(str(_) + '\n') # 将列表2写入本地,路径需自己修改 - vio_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt' + vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt' with open(vio_path, 'w') as f: for _ in mds_vio: - f.write(str(_)+'\n') + f.write(str(_) + '\n') print(time.time() - start) diff --git a/multi_process_infer_by_pairs.py b/multi_process_infer_by_pairs.py new file mode 100644 index 0000000..911b402 --- /dev/null +++ b/multi_process_infer_by_pairs.py @@ -0,0 +1,173 @@ +import multiprocessing +import pandas as pd +import Levenshtein +import copy + + +conf_thresh = 0.8 + +def my_Levenshtein_ratio(str1, str2): + return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2)) + + +def if_minimal(md, md_list, target_col): + # 假设这个md是minimal + minimal = True + for _ in md_list: + if _ != md: + # 假设列表中每一个md都使当前md不minimal + exist = True + # 如果左边任何一个大于,则假设不成立 + for col in list(set(_.keys()) - {target_col}): + if _[col] > md[col]: + exist = False + # 如果右边小于,假设也不成立 + if _[target_col] < md[target_col]: + exist = False + # 任何一次假设成立,当前md不minimal + if exist: + minimal = False + break + return minimal + + +def remove_by_confidence(md, l, relation, target_col, lock): + support, confidence = get_one_md_metadata(md, relation, target_col) + # todo: replace constant 0.8 + if confidence < 0.8: + with lock: + l.remove(md) + + +# def remove_by_confidence(md, l, relation, target_col): +# boolean, conf = satisfy_confidence(md, relation, 0.8, target_col) +# if not boolean: +# l.remove(md) +# print(md, '\t', conf) + + +def inference_from_record_pairs(path, threshold, target_col): + data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1') + data = data.astype(str) + columns = data.columns.values.tolist() + + md_list = [] + minimal_vio = [] + init_md = {} + for col in columns: + init_md[col] = 1 if col == target_col else 0 + md_list.append(init_md) + + for row1 in data.itertuples(): + # 获取当前行的索引,从后一行开始切片 + i = row1[0] + data1 = data[i + 1:] + for row2 in data1.itertuples(): + violated_mds = [] + # sims是两行的相似度 + sims = {} + for col in columns: + similarity = my_Levenshtein_ratio(getattr(row1, col), getattr(row2, col)) + sims[col] = similarity + + # 寻找violated md,从md列表中删除并加入vio列表 + for md in md_list: + lhs_satis = True + rhs_satis = True + for col in list(set(columns) - {target_col}): + if sims[col] < md[col]: + lhs_satis = False + if sims[target_col] < md[target_col]: + rhs_satis = False + if lhs_satis == True and rhs_satis == False: + md_list.remove(md) + violated_mds.append(md) + minimal_vio.extend(violated_mds) + + for vio_md in violated_mds: + # 特殊化右侧,我们需要右侧百分百相似,其实不需要降低右侧阈值 + # if sims[target_col] >= threshold: + # new_rhs = sims[target_col] + # spec_r_md = copy.deepcopy(vio_md) + # spec_r_md[target_col] = new_rhs + # if if_minimal(spec_r_md, md_list, target_col): + # md_list.append(spec_r_md) + + # 特殊化左侧 + for col in list(set(columns) - {target_col}): + if sims[col] + 0.001 <= 1: + spec_l_md = copy.deepcopy(vio_md) + spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.001 + if if_minimal(spec_l_md, md_list, target_col): + md_list.append(spec_l_md) + + for vio in minimal_vio: + if not if_minimal(vio, md_list, target_col): + minimal_vio.remove(vio) + + manager = multiprocessing.Manager() + lock = manager.Lock() + if len(minimal_vio) == 0: + return [], [] + pool = multiprocessing.Pool(len(minimal_vio)) + tmp = copy.deepcopy(minimal_vio) + with manager: + proxy_minimal_vio = manager.list(minimal_vio) + for _ in tmp: + pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, target_col, lock)) + pool.close() + pool.join() + minimal_vio = list(proxy_minimal_vio) + + for _ in tmp: + if not if_minimal(_, minimal_vio, target_col): + minimal_vio.remove(_) + + return md_list, minimal_vio + + +def get_mds_metadata(md_list, dataset_path, target_col): + data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1') + data = data.astype(str) + + manager = multiprocessing.Manager() + if len(md_list) == 0: + return [] + pool = multiprocessing.Pool(len(md_list)) + result = [] + with manager: + for _ in md_list: + task = pool.apply_async(get_one_md_metadata, args=(_, data, target_col)) + support, confidence = task.get() + result.append({"md": _, "support": support, "confidence": confidence}) + pool.close() + pool.join() + return result + + +def get_one_md_metadata(md, dataframe, target_col): + support = 0 + pre_confidence = 0 + for row1 in dataframe.itertuples(): + i = row1[0] + df_slice = dataframe[i + 1:] + for row2 in df_slice.itertuples(): + left_satisfy = True + both_satisfy = True + for col in dataframe.columns.values.tolist(): + sim = my_Levenshtein_ratio(getattr(row1, col), getattr(row2, col)) + if col == target_col: + if sim < 1: + both_satisfy = False + else: + if sim < md[col]: + left_satisfy = False + both_satisfy = False + if left_satisfy: + support += 1 + if both_satisfy: + pre_confidence += 1 + + confidence = 0 if support == 0 else pre_confidence / support + # return {"md": md, "support": support, "confidence": confidence} + return support, confidence