新增脚本:根据MD生成正样本

2 years ago · 59dc97d2e2
parent b1c0abb664
commit 59dc97d2e2
4 changed files with 117 additions and 70 deletions
--- a/entrance.py
+++ b/entrance.py
@ -23,7 +23,7 @@ if __name__ == '__main__':
    # todo
    #  距离度量用户可设置?
    #  使用drop删除特征向量中的列？(如删除id相关特征)
-    run(1)
+    run(3)  # 迭代3轮
    # ml_er(1)
    # todo 将优化结果与参数输出到文件中
    #  通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@ -1,5 +1,4 @@
 import os
-import time

 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
@ -8,8 +7,8 @@ import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd

 from smac import HyperparameterOptimizationFacade, Scenario
-from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
 from settings import *
+from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable

 # 数据在外部加载
 ########################################################################################################################
@ -39,71 +38,6 @@ selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字
 ########################################################################################################################


-def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
-                        test_proportion: float) -> dict:
-    """Compute precision, recall, F1 and ``my_recall`` for a labeled prediction table.
-
-    Rows are classified by comparing the ``labeled_attr`` column (gold label)
-    with the ``predicted_attr`` column; both are compared against 0 and 1.
-
-    :param df: table holding the gold and predicted columns
-    :param labeled_attr: name of the gold-label column
-    :param predicted_attr: name of the predicted-label column
-    :param couple_number: multiplied by ``test_proportion`` to form the
-        denominator of ``my_recall``
-    :param test_proportion: see ``couple_number``
-    :return: dict with keys "precision", "recall", "F1" and "my_recall"
-
-    NOTE(review): ``my_recall`` is not guarded like the other ratios; it raises
-    ZeroDivisionError when ``couple_number * test_proportion`` is 0.
-    """
-    new_df = df.reset_index(drop=False, inplace=False)  # positional 0..n-1 index for the set operations below
-    gold = new_df[labeled_attr]
-    predicted = new_df[predicted_attr]
-    gold_negative = gold[gold == 0].index.values
-    gold_positive = gold[gold == 1].index.values
-    predicted_negative = predicted[predicted == 0].index.values
-    predicted_positive = predicted[predicted == 1].index.values
-
-    false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
-    true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
-    false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
-
-    num_true_positives = float(len(true_positive_indices))
-    num_false_positives = float(len(false_positive_indices))
-    num_false_negatives = float(len(false_negative_indices))
-
-    precision_denominator = num_true_positives + num_false_positives
-    recall_denominator = num_true_positives + num_false_negatives
-
-    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator  # 0.0 when nothing was predicted positive
-    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator  # 0.0 when there are no gold positives
-    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
-    my_recall = num_true_positives / (couple_number * test_proportion)
-
-    return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
-
-
-def load_mds(paths: list) -> list:
-    """Load matching dependencies (MDs) from the given text files.
-
-    Each line is split on tabs; field 0 is ``md:<expr>`` and field 2 is
-    ``confidence:<expr>``. An MD is kept only when its confidence is > 0.
-    Paths that do not exist are skipped.
-
-    NOTE(review): both fields are passed to ``eval``; the files must be trusted.
-    """
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # Iterate over the given list of MD file paths
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # Open each MD file
-        with open(md_path, 'r') as f:
-            # Parse the MD on each line and add it to this file's MD list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-def is_explicable(line, all_mds: list) -> bool:
-    """Return True if at least one MD in ``all_mds`` explains the tuple pair ``line``.
-
-    An MD explains the pair when, for every attribute ``a``, the similarity of
-    ``line.ltable_<a>`` and ``line.rtable_<a>`` reaches the MD's threshold for ``a``.
-
-    NOTE(review): ``all_mds[0]`` raises IndexError when ``all_mds`` is empty.
-    ``line`` is read with ``getattr``, so presumably an ``itertuples`` row —
-    confirm against the caller.
-    """
-    attrs = all_mds[0].keys()  # Take the attribute names from the first MD
-    for md in all_mds:
-        explicable = True  # Assume this MD explains the current tuple
-        for a in attrs:
-            threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)), str(getattr(line, 'rtable_' + a))) < threshold:
-                explicable = False  # If any attribute's similarity is below the threshold, this MD cannot explain the tuple
-                break  # Skip this MD's remaining thresholds and move on to the next MD
-        if explicable:
-            return True  # Some MD explains the tuple: return immediately
-    return False  # All MDs checked; none explains the tuple
-
-
 class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
@ -273,7 +207,7 @@ def ml_er_hpo():
        classifier.configspace,
        deterministic=True,
        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
-        n_workers=2
+        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@ -8,7 +8,6 @@ sys.path.append('/home/w/PycharmProjects/py_entitymatching/py_entitymatching')
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
-import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
--- a/samples_generator.py
+++ b/samples_generator.py
@ -0,0 +1,114 @@
+import os
+import random
+
+import pandas as pd
+import Levenshtein
+
+import ml_er.ml_entity_resolver
+
+
+def my_Levenshtein_ratio(str1, str2):
+    """Return a normalized Levenshtein similarity between two strings.
+
+    The edit distance is divided by the length of the longer string and the
+    quotient is subtracted from 1. Two empty strings yield 1.
+    """
+    longest = max(len(str1), len(str2))
+    if longest == 0:
+        # Both strings are empty: nothing to compare, and no division by zero.
+        return 1
+    edit_distance = Levenshtein.distance(str1, str2)
+    return 1 - edit_distance / longest
+
+
+def load_mds(paths: list) -> list:
+    """Load matching dependencies (MDs) from the given text files.
+
+    Each non-blank line is split on tabs; field 0 is ``md:<expr>`` and field 2
+    is ``confidence:<expr>``. An MD is kept only when its confidence is > 0.
+
+    :param paths: list of MD file paths; paths that do not exist are skipped
+    :return: the retained MDs of all files, in file and line order
+    :raises IndexError: if a non-blank line has fewer than three tab-separated fields
+    """
+    all_mds = []
+    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
+        with open(md_path, 'r') as f:
+            # Iterate the file lazily instead of materializing every line.
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # Blank (e.g. trailing) lines carry no MD; eval('') would raise.
+                    continue
+                md_metadata = line.split('\t')
+                # NOTE(review): eval executes whatever the file contains. Only
+                # load MD files produced by a trusted step; consider
+                # ast.literal_eval if the fields are always plain literals.
+                md = eval(md_metadata[0].replace('md:', ''))
+                confidence = eval(md_metadata[2].replace('confidence:', ''))
+                if confidence > 0:
+                    all_mds.append(md)
+    return all_mds
+
+
+def _satisfies_md(pair: dict, md: dict, attrs: list) -> bool:
+    """Return True if every attribute similarity of ``pair`` reaches the threshold in ``md``."""
+    for attr in attrs:
+        similarity = my_Levenshtein_ratio(str(pair['ltable_' + attr]), str(pair['rtable_' + attr]))
+        if similarity < md[attr]:
+            return False
+    return True
+
+
+# Input: list of MD file paths / path of the prediction table / number of random draws
+# Output: positive samples (with a `gold` column, without a `predicted` column)
+def generate_samples(md_path_list, pred_path, count: int):
+    """Generate synthetic positive samples that satisfy at least one MD.
+
+    Attribute values are drawn at random from the misclassified rows (false
+    positives and false negatives) of the prediction table. A candidate pair
+    is kept when it satisfies at least one loaded MD.
+
+    :param md_path_list: MD file paths, passed to ``load_mds``
+    :param pred_path: CSV with ``ltable_*`` / ``rtable_*`` columns plus
+        ``gold`` and ``predicted``
+    :param count: number of random candidates to draw
+    :return: DataFrame with one ``ltable_<a>`` / ``rtable_<a>`` column pair per
+        attribute and a ``gold`` column set to 1; each accepted candidate
+        appears exactly once
+    :raises KeyError: if the CSV lacks ``gold`` / ``predicted`` or an MD lacks
+        a threshold for one of the table attributes
+    """
+    all_mds = load_mds(md_path_list)
+
+    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
+    predictions.fillna("", inplace=True)
+    predictions = predictions.astype(str)
+
+    l_prefix, r_prefix = 'ltable_', 'rtable_'
+    pred_attrs = set(predictions.columns)  # prefixed names, includes gold and predicted
+    # Un-prefixed attributes present on BOTH sides; strip only the leading
+    # prefix and pair left/right columns by name rather than by position.
+    attrs = [col[len(l_prefix):] for col in predictions.columns
+             if col.startswith(l_prefix) and r_prefix + col[len(l_prefix):] in pred_attrs]
+    out_columns = [side + attr for attr in attrs for side in (l_prefix, r_prefix)]
+
+    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
+    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
+
+    # Per-attribute value pool: both sides of every misclassified pair, in the
+    # order FP-left, FP-right, FN-left, FN-right.
+    pool = {
+        attr: (fp[l_prefix + attr].tolist() + fp[r_prefix + attr].tolist()
+               + fn[l_prefix + attr].tolist() + fn[r_prefix + attr].tolist())
+        for attr in attrs
+    }
+    length = 2 * (len(fp) + len(fn))
+
+    rows = []
+    # Nothing to draw from or nothing to satisfy: skip the loop
+    # (random.randint(0, -1) would raise on an empty pool).
+    if length > 0 and attrs and all_mds:
+        for _ in range(count):
+            candidate = {}
+            for attr in attrs:
+                values = pool[attr]
+                if attr == 'id':
+                    # Both sides share one id value.
+                    value = values[random.randint(0, length - 1)]
+                    candidate[l_prefix + attr] = value
+                    candidate[r_prefix + attr] = value
+                else:
+                    candidate[l_prefix + attr] = values[random.randint(0, length - 1)]
+                    candidate[r_prefix + attr] = values[random.randint(0, length - 1)]
+
+            # Keep the candidate if it satisfies some MD; any() stops at the
+            # first match so a candidate is never added more than once.
+            if any(_satisfies_md(candidate, md, attrs) for md in all_mds):
+                rows.append(candidate)
+
+    # Build the frame once instead of appending row by row.
+    result = pd.DataFrame(rows, columns=out_columns)
+    result['gold'] = 1
+    return result
+
+
+if __name__ == '__main__':
+    # Script entry point: generate samples from fixed local paths.
+    prediction_csv = '/home/w/pred.csv'
+    md_files = [
+        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
+        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
+        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
+        '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt',
+    ]
+
+    # A draw count of one thousand or ten thousand is fine.
+    generate_samples(md_files, prediction_csv, 10000)