import os
import random

import pandas as pd
import Levenshtein

import ml_er.ml_entity_resolver


def my_Levenshtein_ratio(str1, str2):
    # Normalized Levenshtein similarity in [0, 1]; two empty strings count as identical.
    if max(len(str1), len(str2)) == 0:
        return 1
    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))


def load_mds(paths: list) -> list:
    if len(paths) == 0:
        return []
    all_mds = []
    # Iterate over the list of MD file paths.
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        mds = []
        # Open each MD file.
        with open(md_path, 'r') as f:
            # Read the md on each line and add it to this file's md list.
            for line in f.readlines():
                md_metadata = line.strip().split('\t')
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    mds.append(md)
        all_mds.extend(mds)
    return all_mds


# Input: list of MD file paths / path of the prediction table / number of random generation attempts
# Output: some positive samples (with a gold column, without a predicted column)
def generate_samples(md_path_list, pred_path, count: int):
    all_mds = load_mds(md_path_list)

    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
    predictions.fillna("", inplace=True)
    predictions = predictions.astype(str)

    pred_attrs = predictions.columns.values.tolist()  # prediction-table columns, with prefixes, including gold and predicted
    attrs = []  # column names without prefixes, excluding gold and predicted
    l_attrs = []
    r_attrs = []
    for attr in pred_attrs:
        if attr.startswith('ltable_'):
            attrs.append(attr.replace('ltable_', ''))
            l_attrs.append(attr)
        elif attr.startswith('rtable_'):
            r_attrs.append(attr)

    # False positives and false negatives from the prediction table.
    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
    fpl = fp[l_attrs]
    fpr = fp[r_attrs]
    # Unify the column names of the left and right halves.
    fpl.columns = attrs
    fpr.columns = attrs
    fnl = fn[l_attrs]
    fnr = fn[r_attrs]
    fnl.columns = attrs
    fnr.columns = attrs
    fp = pd.concat([fpl, fpr])
    fn = pd.concat([fnl, fnr])
    df = pd.concat([fp, fn])
    length = len(df)

    rows = []
    for _ in range(count):
        # Assemble a random left/right pair: the id is copied from one random row,
        # every other attribute is drawn independently for the left and right side.
        dic = {}
        for attr in attrs:
            if attr == 'id':
                index = random.randint(0, length - 1)
                value = df.iloc[index]['id']
                dic['ltable_' + attr] = value
                dic['rtable_' + attr] = value
            else:
                index1 = random.randint(0, length - 1)
                index2 = random.randint(0, length - 1)
                dic['ltable_' + attr] = df.iloc[index1][attr]
                dic['rtable_' + attr] = df.iloc[index2][attr]
        # Check whether the pair satisfies at least one md; if it does, keep it as a
        # positive sample (inserted once, even if several mds match).
        for md in all_mds:
            satisfied = all(
                my_Levenshtein_ratio(str(dic['ltable_' + attr]), str(dic['rtable_' + attr])) >= md[attr]
                for attr in attrs
            )
            if satisfied:
                rows.append(dic)
                break

    result = pd.DataFrame(rows)
    result['gold'] = 1
    return result


if __name__ == '__main__':
    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
    pre_p = '/home/w/pred.csv'
    generate_samples(md_paths, pre_p, 10000)  # a count of a thousand or ten thousand random draws is fine
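

# Hedged sketch, not part of the original pipeline: the on-disk MD format below is an
# assumption inferred from load_mds() above (tab-separated fields, a dict literal of
# per-attribute similarity thresholds after 'md:', and a 'confidence:' field in the
# third position; the middle field is never read here and its real content is unknown).
# The attribute names 'title' and 'price' and the sample values are hypothetical.
def _md_line_example():
    example_line = "md:{'id': 1.0, 'title': 0.8, 'price': 0.6}\t<ignored>\tconfidence:0.75"
    fields = example_line.split('\t')
    md = eval(fields[0].replace('md:', ''))                   # {'id': 1.0, 'title': 0.8, 'price': 0.6}
    confidence = eval(fields[2].replace('confidence:', ''))   # 0.75
    # A candidate pair satisfies this md when every attribute reaches its threshold.
    left = {'id': '42', 'title': 'iphone 13 pro', 'price': '999'}
    right = {'id': '42', 'title': 'iphone 13 pro', 'price': '999.0'}
    satisfied = all(my_Levenshtein_ratio(left[a], right[a]) >= md[a] for a in md)
    return md, confidence, satisfied  # returns the parsed md, its confidence, and True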