新增脚本:根据MD生成正样本

HuangJintao
HuangJintao 1 year ago
parent b1c0abb664
commit 59dc97d2e2

@ -23,7 +23,7 @@ if __name__ == '__main__':
# todo
# 距离度量用户可设置?
# 使用drop删除特征向量中的列(如删除id相关特征)
run(1)
run(3) # 迭代3轮
# ml_er(1)
# todo 将优化结果与参数输出到文件中
# 通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息

@ -1,5 +1,4 @@
import os
import time
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
from ConfigSpace.conditions import InCondition
@ -8,8 +7,8 @@ import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
from settings import *
from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
# 数据在外部加载
########################################################################################################################
@ -39,71 +38,6 @@ selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字
########################################################################################################################
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
                        test_proportion: float) -> dict:
    """Score binary predictions against gold labels.

    Args:
        df: Table with one row per candidate pair.
        labeled_attr: Name of the column holding the gold label (1 / 0).
        predicted_attr: Name of the column holding the predicted label (1 / 0).
        couple_number: Count multiplied by ``test_proportion`` to form the
            denominator of ``my_recall``; presumably the number of true
            matching pairs in the whole dataset -- confirm with the caller.
        test_proportion: Fraction applied to ``couple_number``.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"F1"`` and
        ``"my_recall"``.  Every metric is 0.0 when its denominator is zero.
    """
    new_df = df.reset_index(drop=False, inplace=False)
    gold = new_df[labeled_attr]
    predicted = new_df[predicted_attr]

    # Boolean masks replace the index-set intersections; only counts matter.
    gold_positive = gold == 1
    gold_negative = gold == 0
    predicted_positive = predicted == 1
    predicted_negative = predicted == 0

    num_true_positives = float((gold_positive & predicted_positive).sum())
    num_false_positives = float((gold_negative & predicted_positive).sum())
    num_false_negatives = float((gold_positive & predicted_negative).sum())

    precision_denominator = num_true_positives + num_false_positives
    recall_denominator = num_true_positives + num_false_negatives
    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)

    # Guarded like the other metrics so an empty test split cannot raise
    # ZeroDivisionError.
    expected_matches = couple_number * test_proportion
    my_recall = 0.0 if expected_matches == 0 else num_true_positives / expected_matches

    return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
def load_mds(paths: list) -> list:
    """Load matching dependencies (MDs) from a list of text files.

    Each non-blank line is split on tabs.  Field 0 is ``md:<expr>`` and
    field 2 is ``confidence:<expr>``; both expressions are evaluated as
    Python.  An MD is kept only when its confidence is greater than 0.

    Args:
        paths: File paths to read.  Paths that do not exist are skipped.

    Returns:
        The kept MDs from all files, in file order then line order.
    """
    all_mds = []
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        with open(md_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank or trailing empty lines carry no MD.
                    continue
                md_metadata = line.split('\t')
                # SECURITY NOTE(review): eval() executes whatever the file
                # contains.  Only load MD files you produced yourself;
                # consider ast.literal_eval if the fields are plain literals.
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    all_mds.append(md)
    return all_mds
def is_explicable(line, all_mds: list) -> bool:
    """Tell whether at least one MD explains the given record pair.

    An MD explains the pair when, for every attribute, the similarity of
    ``line.ltable_<attr>`` and ``line.rtable_<attr>`` (both converted to
    ``str``) reaches the MD's threshold for that attribute.

    Args:
        line: Object exposing ``ltable_<attr>`` / ``rtable_<attr>``
            attributes, read with ``getattr``.
        all_mds: MDs as mappings from attribute name to threshold.  The
            attribute set is taken from the first MD and applied to all.

    Returns:
        True as soon as one MD explains the pair; False otherwise, including
        when ``all_mds`` is empty.
    """
    if not all_mds:
        # No MD available, so nothing can explain the pair.
        return False

    attrs = all_mds[0].keys()
    for md in all_mds:
        explicable = True
        for a in attrs:
            similarity = my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)),
                                              str(getattr(line, 'rtable_' + a)))
            if similarity < md[a]:
                # One attribute below its threshold rules this MD out.
                explicable = False
                break
        if explicable:
            return True
    return False
class Classifier:
@property
def configspace(self) -> ConfigurationSpace:
@ -273,7 +207,7 @@ def ml_er_hpo():
classifier.configspace,
deterministic=True,
n_trials=10, # We want to run max 50 trials (combination of config and seed)
n_workers=2
n_workers=1
)
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

@ -8,7 +8,6 @@ sys.path.append('/home/w/PycharmProjects/py_entitymatching/py_entitymatching')
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import time
import six
from ConfigSpace import Configuration
from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio

@ -0,0 +1,114 @@
import os
import random
import pandas as pd
import Levenshtein
import ml_er.ml_entity_resolver
def my_Levenshtein_ratio(str1, str2):
    """Return a similarity score: 1 minus edit distance over the longer length.

    Two empty inputs are treated as identical and yield 1.
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        return 1
    edit_distance = Levenshtein.distance(str1, str2)
    return 1 - edit_distance / longest
def load_mds(paths: list) -> list:
    """Load matching dependencies (MDs) from a list of text files.

    Each non-blank line is split on tabs.  Field 0 is ``md:<expr>`` and
    field 2 is ``confidence:<expr>``; both expressions are evaluated as
    Python.  An MD is kept only when its confidence is greater than 0.

    Args:
        paths: File paths to read.  Paths that do not exist are skipped.

    Returns:
        The kept MDs from all files, in file order then line order.
    """
    all_mds = []
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        with open(md_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank or trailing empty lines carry no MD.
                    continue
                md_metadata = line.split('\t')
                # SECURITY NOTE(review): eval() executes whatever the file
                # contains.  Only load MD files you produced yourself;
                # consider ast.literal_eval if the fields are plain literals.
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    all_mds.append(md)
    return all_mds
# 输入: md地址列表/预测表地址/随机生成次数
# 输出: 一些正样本(带gold列不带prediction列)
def generate_samples(md_path_list, pred_path, count: int):
    """Synthesize positive samples from mispredicted pairs, filtered by MDs.

    The prediction table is read as text.  The left and right records of
    every false positive (gold '0', predicted '1') and false negative
    (gold '1', predicted '0') are pooled.  ``count`` random attempts are then
    made: each attempt draws every attribute value of a left and a right
    record independently from the pool, except ``id``, which is drawn once
    and shared by both sides.  An attempt is kept (once) when it satisfies at
    least one MD, i.e. every attribute similarity reaches that MD's
    threshold.

    Args:
        md_path_list: Paths of MD files, passed to ``load_mds``.
        pred_path: CSV file with ``ltable_*`` / ``rtable_*`` columns plus
            ``gold`` and ``predicted`` columns.
        count: Number of random attempts, not the number of returned rows.

    Returns:
        DataFrame of the kept pairs with prefixed columns and a ``gold``
        column set to 1 (no ``predicted`` column).  It has no rows when no
        attempt qualifies, when no MD was loaded, or when there are no
        mispredicted pairs to draw from.
    """
    left_prefix = 'ltable_'
    right_prefix = 'rtable_'

    all_mds = load_mds(md_path_list)
    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
    predictions.fillna("", inplace=True)
    # Everything is handled as text, including the gold / predicted flags.
    predictions = predictions.astype(str)

    l_attrs = [col for col in predictions.columns if col.startswith(left_prefix)]
    r_attrs = [col for col in predictions.columns if col.startswith(right_prefix)]
    attrs = [col[len(left_prefix):] for col in l_attrs]  # names without prefix

    false_positive = (predictions['gold'] == '0') & (predictions['predicted'] == '1')
    false_negative = (predictions['gold'] == '1') & (predictions['predicted'] == '0')

    # Pool the left and right halves of every mispredicted pair under common
    # (unprefixed) column names.
    # NOTE(review): assumes the rtable_ columns appear in the same order as
    # the ltable_ columns -- confirm against the prediction file.
    pool_parts = []
    for mask in (false_positive, false_negative):
        wrong = predictions[mask]
        for side_columns in (l_attrs, r_attrs):
            part = wrong[side_columns].copy()
            part.columns = attrs
            pool_parts.append(part)
    pool = pd.concat(pool_parts)
    pool_size = len(pool)

    rows = []
    # With an empty pool there is nothing to draw from (randint would raise),
    # and without MDs no candidate can ever qualify.
    if pool_size > 0 and all_mds:
        columns = {attr: pool[attr] for attr in attrs}
        for _ in range(count):
            candidate = {}
            for attr in attrs:
                if attr == 'id':
                    shared = columns[attr].iat[random.randint(0, pool_size - 1)]
                    candidate[left_prefix + attr] = shared
                    candidate[right_prefix + attr] = shared
                else:
                    first = random.randint(0, pool_size - 1)
                    second = random.randint(0, pool_size - 1)
                    candidate[left_prefix + attr] = columns[attr].iat[first]
                    candidate[right_prefix + attr] = columns[attr].iat[second]

            # Similarities do not depend on the MD, so compute them once.
            ratios = {
                attr: my_Levenshtein_ratio(str(candidate[left_prefix + attr]),
                                           str(candidate[right_prefix + attr]))
                for attr in attrs
            }
            # any() stops at the first satisfied MD, so a candidate is stored
            # once even when several MDs accept it.
            if any(all(ratios[attr] >= md[attr] for attr in attrs) for md in all_mds):
                rows.append(candidate)

    # Build the frame in one go instead of appending row by row.
    result = pd.DataFrame(rows)
    result['gold'] = 1
    return result
# 判断字典是否满足某条md,满足则转为series插入dataframe(初始为空)
if __name__ == '__main__':
    # Prediction table the false positives / false negatives are taken from.
    prediction_csv = '/home/w/pred.csv'
    # MD files handed to generate_samples.
    md_files = [
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt',
    ]
    # The last argument is the number of random attempts; a thousand or ten
    # thousand are both fine.
    # NOTE(review): the returned DataFrame is neither stored nor written out.
    generate_samples(md_files, prediction_csv, 10000)
Loading…
Cancel
Save