新增脚本:根据MD生成正样本

2 years ago · 59dc97d2e2
parent b1c0abb664
commit 59dc97d2e2
4 changed files with 117 additions and 70 deletions
--- a/entrance.py
+++ b/entrance.py
@@ -23,7 +23,7 @@ if __name__ == '__main__':
    # todo
    #  距离度量用户可设置?
    #  使用drop删除特征向量中的列？(如删除id相关特征)
-    run(1)
+    run(3)  # 迭代3轮
    # ml_er(1)
    # todo 将优化结果与参数输出到文件中
    #  通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -1,5 +1,4 @@
 import os
 import time
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
@@ -8,8 +7,8 @@ import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
 from settings import *
 from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
 # 数据在外部加载
 ########################################################################################################################
@@ -39,71 +38,6 @@ selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字
 ########################################################################################################################
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
                         test_proportion: float) -> dict:
    """Score a binary prediction column against a gold label column.

    Only rows whose label and prediction equal 0 or 1 contribute to the
    counts; every ratio whose denominator is zero is reported as 0.0.

    Args:
        df: Table holding both the gold labels and the predictions.
        labeled_attr: Name of the gold label column.
        predicted_attr: Name of the prediction column.
        couple_number: Multiplied by ``test_proportion`` to form the
            denominator of ``my_recall``.
        test_proportion: Factor applied to ``couple_number``.

    Returns:
        Dict with the keys "precision", "recall", "F1" and "my_recall".
    """
    # drop=False keeps the former index as a column, so a label stored in a
    # named index can still be looked up by name.
    new_df = df.reset_index(drop=False, inplace=False)
    gold = new_df[labeled_attr]
    predicted = new_df[predicted_attr]

    num_true_positives = float(((gold == 1) & (predicted == 1)).sum())
    num_false_positives = float(((gold == 0) & (predicted == 1)).sum())
    num_false_negatives = float(((gold == 1) & (predicted == 0)).sum())

    precision_denominator = num_true_positives + num_false_positives
    recall_denominator = num_true_positives + num_false_negatives
    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)

    # Guard the custom recall the same way as the other ratios; an empty
    # test split used to raise ZeroDivisionError here.
    expected_positives = couple_number * test_proportion
    my_recall = 0.0 if expected_positives == 0 else num_true_positives / expected_positives
    return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
 def load_mds(paths: list) -> list:
    """Load the matching dependencies (MDs) with positive confidence.

    Every non-blank line is split on tabs; field 0 is ``md:<expression>``
    and field 2 is ``confidence:<expression>``. Both expressions are
    evaluated with ``eval``.

    Args:
        paths: Paths of MD files. Paths that do not exist are skipped.

    Returns:
        The MDs whose confidence is greater than 0, in file and line order.
        An empty list when ``paths`` is empty or no file exists.
    """
    all_mds = []
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        with open(md_path, 'r') as f:
            # Stream the file instead of materialising it with readlines().
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # A blank (e.g. trailing) line carries no MD; eval('')
                    # would raise SyntaxError.
                    continue
                md_metadata = stripped.split('\t')
                # SECURITY NOTE(review): eval() executes whatever the file
                # contains. Only feed files produced by the MD discovery
                # step; consider ast.literal_eval if the format allows it.
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    all_mds.append(md)
    return all_mds
 def is_explicable(line, all_mds: list) -> bool:
    """Tell whether at least one MD explains the given tuple pair.

    An MD explains ``line`` when, for every attribute ``a``, the similarity
    of ``line.ltable_<a>`` and ``line.rtable_<a>`` (both converted to str)
    is not below the MD's threshold ``md[a]``.

    Args:
        line: Object exposing ``ltable_<attr>`` and ``rtable_<attr>``
            attributes, read with ``getattr``.
        all_mds: MDs, each mapping an attribute name to a threshold.

    Returns:
        True as soon as one MD explains the pair. False when none does or
        when ``all_mds`` is empty.
    """
    if not all_mds:
        # Without any MD nothing can be explained; all_mds[0] used to raise
        # IndexError here.
        return False
    attrs = all_mds[0].keys()  # attribute names are taken from the first MD
    for md in all_mds:
        # A single attribute below its threshold disqualifies this MD.
        violated = any(
            my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)), str(getattr(line, 'rtable_' + a))) < md[a]
            for a in attrs
        )
        if not violated:
            return True  # one explaining MD is enough
    return False
 class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
@@ -273,7 +207,7 @@ def ml_er_hpo():
        classifier.configspace,
        deterministic=True,
        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
-        n_workers=2
+        n_workers=1
    )
    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@@ -8,7 +8,6 @@ sys.path.append('/home/w/PycharmProjects/py_entitymatching/py_entitymatching')
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
--- a/samples_generator.py
+++ b/samples_generator.py
@@ -0,0 +1,114 @@
 import os
 import random
 import pandas as pd
 import Levenshtein
 import ml_er.ml_entity_resolver
 def my_Levenshtein_ratio(str1, str2):
    """Return a length-normalised Levenshtein similarity of two strings.

    The edit distance is divided by the length of the longer input and the
    quotient is subtracted from 1. Two empty inputs yield 1.
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        return 1
    distance = Levenshtein.distance(str1, str2)
    return 1 - distance / longest
 def load_mds(paths: list) -> list:
    """Read matching dependencies (MDs) with positive confidence from files.

    Each non-blank line is tab-separated: field 0 holds ``md:<expression>``
    and field 2 holds ``confidence:<expression>``. Both expressions are
    passed to ``eval``.

    Args:
        paths: MD file paths. Non-existent paths are silently skipped.

    Returns:
        All MDs with confidence greater than 0, in file and line order.
        An empty list when nothing could be read.
    """
    all_mds = []
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        with open(md_path, 'r') as f:
            # Iterate lazily; readlines() would load the whole file first.
            for raw_line in f:
                record = raw_line.strip()
                if not record:
                    # Blank lines (typically a trailing newline) hold no MD
                    # and would make eval('') raise SyntaxError.
                    continue
                md_metadata = record.split('\t')
                # SECURITY NOTE(review): eval() runs arbitrary code found in
                # the file. Use only with trusted MD discovery output;
                # ast.literal_eval may be a safer fit for plain literals.
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    all_mds.append(md)
    return all_mds
 def generate_samples(md_path_list, pred_path, count: int):
    """Randomly synthesise positive record pairs that satisfy at least one MD.

    Attribute values are drawn from the rows of the prediction table that
    were classified wrongly (false positives and false negatives), taking
    both their left and right sides as one value pool. For each candidate,
    every attribute gets an independently drawn left and right value; the
    ``id`` attribute gets the same drawn value on both sides. A candidate is
    kept when some MD's thresholds are all met by the per-attribute
    similarity.

    Args:
        md_path_list: Paths of MD files, handed to ``load_mds``.
        pred_path: CSV of predictions (read as ISO-8859-1) with
            ``ltable_<attr>`` / ``rtable_<attr>`` columns plus ``gold`` and
            ``predicted``.
        count: Number of random candidates to draw.

    Returns:
        DataFrame with ``ltable_<attr>`` / ``rtable_<attr>`` columns and a
        ``gold`` column set to 1 (no ``predicted`` column). Each accepted
        candidate appears once. The frame is empty when there are no MDs or
        no wrongly classified rows.

    Raises:
        KeyError: If an ``ltable_<attr>`` column has no ``rtable_<attr>``
            counterpart, or an MD lacks a threshold for an attribute.
    """
    all_mds = load_mds(md_path_list)
    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
    predictions = predictions.fillna("").astype(str)

    l_prefix, r_prefix = 'ltable_', 'rtable_'
    # Strip only the leading prefix; str.replace would also rewrite any later
    # occurrence inside the attribute name.
    attrs = [col[len(l_prefix):] for col in predictions.columns if col.startswith(l_prefix)]
    l_attrs = [l_prefix + attr for attr in attrs]
    # Pair every left column with its right counterpart by name instead of
    # relying on both sides appearing in the same column order.
    r_attrs = [r_prefix + attr for attr in attrs]

    gold, predicted = predictions['gold'], predictions['predicted']
    false_positives = predictions[(gold == '0') & (predicted == '1')]
    false_negatives = predictions[(gold == '1') & (predicted == '0')]

    # Stack the left and right sides of all wrong rows under unprefixed names.
    pool_parts = []
    for wrong_rows in (false_positives, false_negatives):
        for side_cols in (l_attrs, r_attrs):
            side = wrong_rows[side_cols].copy()  # copy: renaming a slice warns
            side.columns = attrs
            pool_parts.append(side)
    pool = pd.concat(pool_parts)
    length = len(pool)
    # Plain lists make the per-draw lookup cheap compared with DataFrame.iloc.
    pool_values = {attr: pool[attr].tolist() for attr in attrs}

    records = []
    # An empty pool used to crash in random.randint(0, -1); without MDs no
    # candidate can ever be accepted, so drawing is pointless.
    if length > 0 and all_mds:
        for _ in range(count):
            candidate = {}
            for attr in attrs:
                values = pool_values[attr]
                if attr == 'id':
                    left = right = values[random.randint(0, length - 1)]
                else:
                    left = values[random.randint(0, length - 1)]
                    right = values[random.randint(0, length - 1)]
                candidate[l_prefix + attr] = left
                candidate[r_prefix + attr] = right
            # Keep the candidate once if any MD is satisfied on every attribute
            # (previously it was appended once per satisfied MD).
            explained = any(
                not any(
                    my_Levenshtein_ratio(str(candidate[l_prefix + attr]), str(candidate[r_prefix + attr])) < md[attr]
                    for attr in attrs
                )
                for md in all_mds
            )
            if explained:
                records.append(candidate)

    # Build the frame in one go rather than appending row by row through the
    # private DataFrame._append.
    out_columns = [prefix + attr for attr in attrs for prefix in (l_prefix, r_prefix)]
    result = pd.DataFrame(records, columns=out_columns)
    result['gold'] = 1
    return result
 if __name__ == '__main__':
    # Script entry point: generate positive samples from the MD files and the
    # prediction table listed below.
    # NOTE(review): the returned DataFrame is not stored or written anywhere.
    md_paths = [
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt',
    ]
    pre_p = '/home/w/pred.csv'
    # A draw count of one thousand or ten thousand is fine.
    generate_samples(md_path_list=md_paths, pred_path=pre_p, count=10000)