diff --git a/entrance.py b/entrance.py
index d51bf8a..f441733 100644
--- a/entrance.py
+++ b/entrance.py
@@ -23,7 +23,7 @@ if __name__ == '__main__':
     # todo
     # make the distance metric configurable by the user?
     # use drop to remove columns from the feature vector? (e.g. drop id-related features)
-    run(1)
+    run(3)  # run 3 iterations
     # ml_er(1)
     # todo: write the optimization results and parameters to a file
     #       output via ml_entity_resolver.ml_er(), together with the parameter configuration
diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
index 81c1c76..a1437e5 100644
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -1,5 +1,4 @@
 import os
-import time
 
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
@@ -8,8 +7,8 @@ import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
 
-from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
 from settings import *
+from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
 
 # data is loaded externally
 ########################################################################################################################
@@ -39,71 +38,6 @@ selected_attrs = selected_ltable.columns.values.tolist()  # the attributes of the two tables
 ########################################################################################################################
 
 
-def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
-                        test_proportion: float) -> dict:
-    new_df = df.reset_index(drop=False, inplace=False)
-    gold = new_df[labeled_attr]
-    predicted = new_df[predicted_attr]
-    gold_negative = gold[gold == 0].index.values
-    gold_positive = gold[gold == 1].index.values
-    predicted_negative = predicted[predicted == 0].index.values
-    predicted_positive = predicted[predicted == 1].index.values
-
-    false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
-    true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
-    false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
-
-    num_true_positives = float(len(true_positive_indices))
-    num_false_positives = float(len(false_positive_indices))
-    num_false_negatives = float(len(false_negative_indices))
-
-    precision_denominator = num_true_positives + num_false_positives
-    recall_denominator = num_true_positives + num_false_negatives
-
-    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
-    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
-    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
-    my_recall = num_true_positives / (couple_number * test_proportion)
-
-    return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # iterate over the given list of md file paths
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # open each md file
-        with open(md_path, 'r') as f:
-            # parse the md on each line and collect it into this file's md list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-def is_explicable(line, all_mds: list) -> bool:
-    attrs = all_mds[0].keys()  # read all attribute names from the first md
-    for md in all_mds:
-        explicable = True  # assume this md can explain the current tuple
-        for a in attrs:
-            threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)),
-                                    str(getattr(line, 'rtable_' + a))) < threshold:
-                explicable = False  # if any attribute's similarity falls below its threshold, this md cannot explain the tuple
-                break  # skip this md's remaining thresholds and move on to the next md
-        if explicable:
-            return True  # one explaining md is enough, return immediately
-    return False  # no md can explain the tuple
-
-
 class Classifier:
     @property
     def configspace(self) -> ConfigurationSpace:
@@ -273,7 +207,7 @@ def ml_er_hpo():
         classifier.configspace,
         deterministic=True,
         n_trials=10,  # run at most 10 trials (combinations of config and seed)
-        n_workers=2
+        n_workers=1
     )
 
     initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
index 1063eca..f766c8c 100644
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@@ -8,7 +8,6 @@ sys.path.append('/home/w/PycharmProjects/py_entitymatching/py_entitymatching')
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
-import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
diff --git a/samples_generator.py b/samples_generator.py
new file mode 100644
index 0000000..0b15a09
--- /dev/null
+++ b/samples_generator.py
@@ -0,0 +1,114 @@
+import os
+import random
+
+import pandas as pd
+import Levenshtein
+
+import ml_er.ml_entity_resolver
+
+
+def my_Levenshtein_ratio(str1, str2):
+    # normalized Levenshtein similarity in [0, 1]; identical strings score 1
+    if max(len(str1), len(str2)) == 0:
+        return 1
+    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
+
+
+def load_mds(paths: list) -> list:
+    if len(paths) == 0:
+        return []
+    all_mds = []
+    # iterate over the given list of md file paths
+    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
+        mds = []
+        # open each md file
+        with open(md_path, 'r') as f:
+            # parse the md on each line and collect it into this file's md list
+            for line in f.readlines():
+                md_metadata = line.strip().split('\t')
+                md = eval(md_metadata[0].replace('md:', ''))
+                confidence = eval(md_metadata[2].replace('confidence:', ''))
+                if confidence > 0:
+                    mds.append(md)
+        all_mds.extend(mds)
+    return all_mds
+
+
+# Input: list of md paths / path of the prediction table / number of random draws
+# Output: positive samples (with a gold column, without a prediction column)
+def generate_samples(md_path_list, pred_path, count: int):
+    all_mds = load_mds(md_path_list)
+
+    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
+    predictions.fillna("", inplace=True)
+    predictions = predictions.astype(str)
+    pred_attrs = predictions.columns.values.tolist()  # prefixed columns of the prediction table, including gold and predicted
+    attrs = []  # unprefixed columns, excluding gold and predicted
+    l_attrs = []
+    r_attrs = []
+    for _ in pred_attrs:
+        if _.startswith('ltable_'):
+            attrs.append(_.replace('ltable_', ''))
+            l_attrs.append(_)
+        elif _.startswith('rtable_'):
+            r_attrs.append(_)
+
+    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]  # false positives
+    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]  # false negatives
+
+    fpl = fp[l_attrs]
+    fpr = fp[r_attrs]
+    # unify the column names of the left and right halves
+    fpl.columns = attrs
+    fpr.columns = attrs
+    fnl = fn[l_attrs]
+    fnr = fn[r_attrs]
+    fnl.columns = attrs
+    fnr.columns = attrs
+    fp = pd.concat([fpl, fpr])
+    fn = pd.concat([fnl, fnr])
+    df = pd.concat([fp, fn])
+    length = len(df)
+
+    result = pd.DataFrame()
+    for i in range(0, count):
+        # build a candidate pair by randomly recombining attribute values of misclassified tuples
+        dic = {}
+        for _ in attrs:
+            if _ == 'id':  # keep the id identical on both sides
+                index = random.randint(0, length-1)
+                value = df.iloc[index]['id']
+                dic['ltable_'+_] = value
+                dic['rtable_'+_] = value
+            else:
+                index1 = random.randint(0, length-1)
+                index2 = random.randint(0, length-1)
+                value1 = df.iloc[index1][_]
+                value2 = df.iloc[index2][_]
+                dic['ltable_'+_] = value1
+                dic['rtable_'+_] = value2
+
+        # check whether the dict satisfies some md; if so, convert it to a Series
+        # and append it to the (initially empty) result dataframe
+        for md in all_mds:
+            satis = True
+            for _ in attrs:
+                if my_Levenshtein_ratio(str(dic['ltable_'+_]), str(dic['rtable_'+_])) < md[_]:
+                    satis = False
+                    break
+            if satis:
+                series = pd.Series(dic)
+                result = pd.concat([result, series.to_frame().T], ignore_index=True)
+                break  # append each generated pair at most once, even if several mds explain it
+    result['gold'] = 1
+    return result
+
+
+if __name__ == '__main__':
+    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
+                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
+                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
+                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
+
+    pre_p = '/home/w/pred.csv'
+    generate_samples(md_paths, pre_p, 10000)
+    # the number of random draws can easily be in the thousands or tens of thousands
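
Note on the md file format that load_mds assumes: each line is tab-separated and
only the first and third fields are parsed; the "support" name of the middle
field below is an assumption. A minimal sketch of one line and how it is read:

    # one line of an md file, parsed exactly as load_mds does
    line = "md:{'id': 1.0, 'title': 0.75}\tsupport:120\tconfidence:0.9"
    md_metadata = line.strip().split('\t')
    md = eval(md_metadata[0].replace('md:', ''))                  # {'id': 1.0, 'title': 0.75}
    confidence = eval(md_metadata[2].replace('confidence:', ''))  # 0.9
    # load_mds keeps this md because confidence > 0; generate_samples then keeps
    # a random pair only if every attribute's Levenshtein ratio meets the
    # corresponding threshold of some md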