Third merge #3

Merged
p04316279 merged 2 commits from HuangJintao into master 1 year ago

@@ -3,7 +3,6 @@ import time
 import Levenshtein
 import copy
 def my_Levenshtein_ratio(str1, str2):
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
@@ -131,21 +130,21 @@ def inference_from_record_pairs(path, threshold, target_col):
 if __name__ == '__main__':
     # For now, new code can be modeled on this main function
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/DBLP-ACM/output/7.6/TP_single_tuple.csv"
+    path = "input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # Input: CSV file path, similarity threshold for the LHS of an MD, target column for the RHS of an MD
     # Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # For example, the parameters here require each LHS similarity of an MD to be at least 0.7, with the RHS pointing to the 'id' column
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id_concat')
     # Write list 1 to a local path (adjust the path yourself)
-    md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
+    md_path = 'output/md.txt'
     with open(md_path, 'w') as f:
         for _ in mds:
             f.write(str(_) + '\n')
     # Write list 2 to a local path (adjust the path yourself)
-    vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
+    vio_path = 'output/vio.txt'
     with open(vio_path, 'w') as f:
         for _ in mds_vio:
             f.write(str(_) + '\n')
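The main script serializes each discovered MD with `str()`, one per line. If an MD is a plain dict mapping column names to similarity thresholds (an assumption; the diff does not show how MDs are constructed), the file written above can be read back with `ast.literal_eval`:

```python
# Hypothetical round-trip for output/md.txt, assuming each line is the
# repr() of a plain dict such as {'title': 0.7, 'authors': 0.7}.
import ast

with open('output/md.txt') as f:
    mds = [ast.literal_eval(line) for line in f if line.strip()]
print(len(mds), "MDs loaded")
```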

@@ -6,13 +6,18 @@ import copy
 conf_thresh = 0.8
 def my_Levenshtein_ratio(str1, str2):
+    if max(len(str1), len(str2)) == 0:
+        return 1
     return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
 def if_minimal(md, md_list, target_col):
     # Assume this MD is minimal
     minimal = True
+    if md_list.count(md) > 1:
+        return False
     for _ in md_list:
         if _ != md:
             # Assume every MD in the list makes the current MD non-minimal
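Both additions in this hunk are defensive. The length guard stops `my_Levenshtein_ratio` from dividing by zero when both strings are empty, and the `count(md) > 1` check makes `if_minimal` reject exact duplicates immediately. A minimal sketch of the patched similarity helper, assuming the python-Levenshtein package:

```python
import Levenshtein

def my_Levenshtein_ratio(str1, str2):
    # Guard added in this PR: two empty strings are trivially identical,
    # and max(len, len) == 0 would otherwise raise ZeroDivisionError.
    if max(len(str1), len(str2)) == 0:
        return 1
    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))

print(my_Levenshtein_ratio("kitten", "sitting"))  # 1 - 3/7 ≈ 0.571
print(my_Levenshtein_ratio("", ""))               # 1 (previously crashed)
```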
@@ -48,6 +53,7 @@ def remove_by_confidence(md, l, relation, target_col, lock):
 def inference_from_record_pairs(path, threshold, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
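The new `fillna` call matters because `astype(str)` alone converts missing cells to the literal string 'nan', which would then look similar to every other missing cell when similarities are computed. A small standalone illustration of the difference:

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({'a': ['x', np.nan]})
print(df.astype(str)['a'].tolist())   # ['x', 'nan'] – NaN leaks in as text
df.fillna("", inplace=True)
print(df.astype(str)['a'].tolist())   # ['x', ''] – empty string instead
```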
@@ -71,7 +77,8 @@ def inference_from_record_pairs(path, threshold, target_col):
             sims[col] = similarity
         # Find violated MDs, remove them from the MD list and add them to the vio list
-        for md in md_list:
+        # tmp_md_list = copy.deepcopy(md_list)
+        for md in md_list[:]:
             lhs_satis = True
             rhs_satis = True
             for col in list(set(columns) - {target_col}):
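Replacing `for md in md_list:` with `for md in md_list[:]:` fixes a classic pitfall: removing elements from a list while iterating over it silently skips items. The slice takes a cheap shallow copy to iterate, so the original can be mutated safely; it also avoids the heavier `copy.deepcopy` that the commented-out line used. A standalone illustration:

```python
items = [1, 2, 2, 3]
# Buggy: after the first 2 is removed, the iterator skips the second 2.
for x in items:
    if x == 2:
        items.remove(x)
print(items)  # [1, 2, 3] – one 2 survives

items = [1, 2, 2, 3]
# Fixed: iterate over a shallow copy, mutate the original.
for x in items[:]:
    if x == 2:
        items.remove(x)
print(items)  # [1, 3]
```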
@@ -101,33 +108,39 @@ def inference_from_record_pairs(path, threshold, target_col):
                     if if_minimal(spec_l_md, md_list, target_col):
                         md_list.append(spec_l_md)
-        for vio in minimal_vio:
+        # tmp_minimal_vio = copy.deepcopy(minimal_vio)
+        for vio in minimal_vio[:]:
             if not if_minimal(vio, md_list, target_col):
                 minimal_vio.remove(vio)
     manager = multiprocessing.Manager()
     lock = manager.Lock()
     if len(minimal_vio) == 0:
-        return [], []
+        return md_list, []
     pool = multiprocessing.Pool(len(minimal_vio))
-    tmp = copy.deepcopy(minimal_vio)
+    # tmp = copy.deepcopy(minimal_vio)
     with manager:
         proxy_minimal_vio = manager.list(minimal_vio)
-        for _ in tmp:
+        for _ in minimal_vio[:]:
             pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, target_col, lock))
         pool.close()
         pool.join()
         minimal_vio = list(proxy_minimal_vio)
-    for _ in tmp:
+    for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
     return md_list, minimal_vio
 def get_mds_metadata(md_list, dataset_path, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
     data = data.astype(str)
     manager = multiprocessing.Manager()
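The pruning of `minimal_vio` keeps the existing pattern: a `Manager` exposes the list as a picklable proxy, each violating MD gets its own pool worker, workers mutate the shared list under a lock, and afterwards the survivors (and now `md_list` itself) get a final minimality pass. A stripped-down sketch of that pattern, with a hypothetical `drop_odd` worker standing in for `remove_by_confidence`:

```python
import multiprocessing

def drop_odd(x, shared, lock):
    # Stand-in for remove_by_confidence: prune the shared list under the lock.
    with lock:
        if x % 2 == 1 and x in shared:
            shared.remove(x)

if __name__ == '__main__':
    values = [1, 2, 3, 4]
    manager = multiprocessing.Manager()
    lock = manager.Lock()
    pool = multiprocessing.Pool(len(values))  # one worker per item, as in the patch
    with manager:
        shared = manager.list(values)         # proxy that workers can mutate
        for v in values[:]:
            pool.apply_async(drop_odd, args=(v, shared, lock))
        pool.close()
        pool.join()
        result = list(shared)                 # copy back out of the proxy
    print(result)  # [2, 4]
```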

@@ -1,19 +1,19 @@
 import time
-from multi_process_infer_by_pairs import inference_from_record_pairs
-from multi_process_infer_by_pairs import get_mds_metadata
+from functions.multi_process_infer_by_pairs import inference_from_record_pairs
+from functions.multi_process_infer_by_pairs import get_mds_metadata
 if __name__ == '__main__':
     # For now, new code can be modeled on this main function
-    path = "/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/output/8.14/TP_single_tuple.csv"
+    path = "/home/w/PycharmProjects/matching_dependency/input/T_positive_with_id_concat_single_tuple.csv"
     start = time.time()
     # Input: CSV file path, similarity threshold for the LHS of an MD, target column for the RHS of an MD
     # Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # For example, the parameters here require each LHS similarity of an MD to be at least 0.7, with the RHS pointing to the 'id' column
-    mds, mds_vio = inference_from_record_pairs(path, 0.7, 'id')
+    mds, mds_vio = inference_from_record_pairs(path, 0.1, 'id_concat')
     # If support and confidence are not needed in the output, remove the two lines below
-    mds_meta = get_mds_metadata(mds, path, 'id')
-    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id')
+    mds_meta = get_mds_metadata(mds, path, 'id_concat')
+    mds_vio_meta = get_mds_metadata(mds_vio, path, 'id_concat')
     # # If not outputting support and confidence, use the following two code blocks
     # # Write list 1 to a local path (adjust the path yourself)
@@ -30,7 +30,7 @@ if __name__ == '__main__':
     # If outputting support and confidence, use the following two code blocks
     # Write list 1 to a local path (adjust the path yourself)
-    md_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt'
+    md_path = "output/md.txt"
     with open(md_path, 'w') as f:
         for _ in mds_meta:
             for i in _.keys():
@@ -38,11 +38,11 @@ if __name__ == '__main__':
             f.write('\n')
     # Write list 2 to a local path (adjust the path yourself)
-    vio_path = '/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt'
+    vio_path = "output/vio.txt"
     with open(vio_path, 'w') as f:
         for _ in mds_vio_meta:
             for i in _.keys():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
     print(time.time() - start)
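Each metadata record is written as tab-separated `key:value` pairs on a single line. A hypothetical reader for these files; it splits on the first ':' only, since a value may itself contain colons:

```python
# Reads output/md.txt or output/vio.txt back into a list of dicts,
# assuming the key:value\t...\n layout written by the loop above.
def read_metadata(path):
    records = []
    with open(path) as f:
        for line in f:
            fields = [p for p in line.rstrip('\n').split('\t') if p]
            records.append(dict(p.split(':', 1) for p in fields))
    return records

for rec in read_metadata('output/md.txt'):
    print(rec)
```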