HuangJintao 1 year ago
parent c349768eaf
commit e18295838a

@@ -1,29 +1,28 @@
 # this is the entrance of the auto-ER procedure
 from md_discovery.script.md_discover import md_discover
+from ml_er.ml_entity_resolver import ml_er
+from hpo.er_model_hpo import ml_er_hpo
+from settings import *
-ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
-mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
-ltable_id = 'id'  # name of the id column in the left table
-rtable_id = 'id'  # name of the id column in the right table
-target_attr = 'id'  # target attribute for MD discovery
-lr_attrs_map = {'title': 'name'}  # if corresponding columns are named differently in the two tables, list them here so they can be renamed to match
-similarity_threshold = 0.7
-confidence_threshold = 0.8
-interpretability_weight = 0.3
-def run(l_table_path, r_table_path, mapping_path):
-    # while the termination condition is not met:
-    while True:
-        # er()
+def run(rounds: int):
+    hp_config = None
+    # while the termination condition is not met
+    iter_round = 1
+    for i in range(0, rounds):
+        ml_er(iter_round, hp_config)
         md_discover()
-        # hpo()
+        hp_config = ml_er_hpo()
+        iter_round += 1
+    ml_er(iter_round, hp_config)
     return
 if __name__ == '__main__':
-    # todo: should the distance metric be user-configurable?
+    path = 'md_discovery/output'
+    # todo
+    # should the distance metric be user-configurable?
+    # use drop to remove columns from the feature vectors (e.g. id-related features)
+    run(1)
+    # ml_er(1)
     print(ltable_path)

@@ -1,4 +1,5 @@
-from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+import os
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
@@ -6,15 +7,13 @@ import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
+from settings import *
 # data is loaded here at module level
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-cm.set_key(ltable, ltable_id)
 ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-cm.set_key(rtable, rtable_id)
 rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)
@@ -32,21 +31,12 @@ for index, row in mappings.iterrows():
 # keep only the rows that appear in the mapping table, to raise the proportion of positive samples
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
 selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # following the right table, rename left-table columns that correspond to right-table columns but are named differently
+tables_id = rtable_id  # whether or not the two id column names match, after the rename above the right table's id name is used for both tables
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # column names of the two tables
-attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-cm.set_key(selected_ltable, ltable_id)
-cm.set_key(selected_rtable, rtable_id)
 ########################################################################################################################
-def test_test():
-    block_attr_items = selected_attrs[:]
-    block_attr_items.remove(rtable_id)
-    print(block_attr_items)
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
                         test_proportion: float) -> dict:
     new_df = df.reset_index(drop=False, inplace=False)
@@ -82,6 +72,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # the argument is a list of MD file paths
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # open each MD file
         with open(md_path, 'r') as f:
@@ -102,7 +94,7 @@ def is_explicable(row, all_mds: list) -> bool:
         explicable = True  # assume this MD can explain the current tuple pair
         for a in attrs:
             threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(row, 'ltable_'+a)), str(getattr(row, 'rtable_'+a))) < threshold:
+            if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
                 explicable = False  # if any attribute's similarity is below its threshold, this MD cannot explain the tuple pair
                 break  # stop checking this MD's remaining thresholds and move on to the next MD
         if explicable:
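The loop above encodes the MD semantics used throughout this commit: an MD is a dict mapping attribute names to minimum similarity thresholds, and a record pair is "explicable" only if every attribute reaches its threshold. A standalone sketch of that check (illustration only, not part of the commit; difflib's ratio stands in for my_Levenshtein_ratio, and the MD and record pair are made up):

import difflib

def sim(a: str, b: str) -> float:
    # stand-in similarity; the project uses my_Levenshtein_ratio instead
    return difflib.SequenceMatcher(None, a, b).ratio()

md = {'name': 0.7, 'manufacturer': 0.6}  # hypothetical thresholds on the MD left-hand side
pair = {'ltable_name': 'canon powershot a510', 'rtable_name': 'canon powershot a510 camera',
        'ltable_manufacturer': 'canon', 'rtable_manufacturer': 'canon inc.'}

explicable = all(sim(str(pair['ltable_' + a]), str(pair['rtable_' + a])) >= md[a] for a in md)
print(explicable)  # True only if every attribute meets its threshold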
@@ -116,13 +108,12 @@ class Classifier:
         # Build Configuration Space which defines all parameters and their ranges
         cs = ConfigurationSpace(seed=0)
         block_attr_items = selected_attrs[:]
-        block_attr_items.remove(rtable_id)
+        block_attr_items.remove(tables_id)
         block_attr = Categorical("block_attr", block_attr_items)
         overlap_size = Integer("overlap_size", (1, 3), default=1)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        # todo: other tunable parameters (e.g. dropping a column from the feature table)
         use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
         cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
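The InCondition above makes overlap_size a conditional hyperparameter: it is only active when ml_blocker is "over_lap". A standalone sketch of that behavior (illustration only, not part of the commit; "title" and "price" are placeholder attribute names, and registering the condition with cs.add_conditions is assumed to happen as in the full file):

from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import InCondition

cs = ConfigurationSpace(seed=0)
block_attr = Categorical("block_attr", ["title", "price"])
overlap_size = Integer("overlap_size", (1, 3), default=1)
ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
cs.add_hyperparameters([block_attr, overlap_size, ml_blocker])
cs.add_conditions([InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])])

for cfg in cs.sample_configuration(5):
    # overlap_size appears only in configurations where ml_blocker == "over_lap"
    print(dict(cfg))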
@@ -131,6 +122,11 @@ class Classifier:
     # train runs the whole ER pipeline; the only difference is that it returns an evaluation of the predictions instead of the predictions themselves
     def train(self, config: Configuration, seed: int = 0) -> float:
+        attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # column names with the left-table prefix
+        attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # column names with the right-table prefix
+        cm.set_key(selected_ltable, tables_id)
+        cm.set_key(selected_rtable, tables_id)
         if config["ml_blocker"] == "over_lap":
             blocker = em.OverlapBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@@ -145,13 +141,13 @@ class Classifier:
         candidate_match_rows = []
         for index, row in candidate.iterrows():
-            l_id = row['ltable_' + ltable_id]
+            l_id = row['ltable_' + tables_id]
             map_row = mappings[mappings[mapping_lid] == l_id]
             if map_row is not None:
                 r_id = map_row[mapping_rid]
                 for value in r_id:
-                    if value == row['rtable_' + rtable_id]:
+                    if value == row['rtable_' + tables_id]:
                         candidate_match_rows.append(row["_id"])
                     else:
                         continue
@@ -165,9 +161,12 @@ class Classifier:
             candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
         # concatenate the positive and negative samples
         candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
+        if len(candidate_for_train_test) == 0:
+            return 1
         cm.set_key(candidate_for_train_test, '_id')
-        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
         cm.set_ltable(candidate_for_train_test, selected_ltable)
         cm.set_rtable(candidate_for_train_test, selected_rtable)
@@ -178,7 +177,18 @@ class Classifier:
         train_set = sets['train']
         test_set = sets['test']
-        matcher = None
+        cm.set_key(train_set, '_id')
+        cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
+        cm.set_ltable(train_set, selected_ltable)
+        cm.set_rtable(train_set, selected_rtable)
+        cm.set_key(test_set, '_id')
+        cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
+        cm.set_ltable(test_set, selected_ltable)
+        cm.set_rtable(test_set, selected_rtable)
         if config["ml_matcher"] == "dt":
             matcher = em.DTMatcher(name='DecisionTree', random_state=0)
         elif config["ml_matcher"] == "svm":
@@ -198,25 +208,21 @@ class Classifier:
                                                      attrs_after=['gold'],
                                                      show_progress=False)
-        # todo: decouple the attribute names
+        test_feature_after = attrs_with_l_prefix[:]
+        test_feature_after.extend(attrs_with_r_prefix)
+        for _ in test_feature_after:
+            if _.endswith(tables_id):
+                test_feature_after.remove(_)
+        test_feature_after.append('gold')
         test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
-                                                                 'ltable_price', 'rtable_name', 'rtable_description',
-                                                                 'rtable_manufacturer', 'rtable_price', 'gold'], show_progress=False)
-        # todo: tunable parameters; use drop to remove columns from the feature vectors
-        # 1.exclude_attrs
-        # drop the id-related similarity features
-        matcher.fit(table=train_feature_vecs,
-                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-                    target_attr='gold')
-        # 1.exclude_attrs
-        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title',
-                                                                              'ltable_description', 'ltable_manufacturer',
-                                                                              'ltable_price', 'rtable_name', 'rtable_description',
-                                                                              'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                    attrs_after=test_feature_after, show_progress=False)
+        fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+        matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+        test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                       append=True, target_attr='predicted', inplace=False)
         eval_result = em.eval_matches(predictions, 'gold', 'predicted')
         em.print_eval_summary(eval_result)
         indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, test_proportion)
@@ -231,34 +237,42 @@ class Classifier:
         # default path: "../md_discovery/output/xxx.txt"
         # 4 MD files in total: tp/fn, each with mds and vio
-        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
-                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
+        md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                    'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
         epl_match = 0  # explainable and predicted as a match
         nepl_mismatch = 0  # not explainable and predicted as a mismatch
         md_list = load_mds(md_paths)  # read in all the MDs
-        for row in predictions.itertuples():
-            if is_explicable(row, md_list):
-                if getattr(row, 'predicted') == 1:
-                    epl_match += 1
-            else:
-                if getattr(row, 'predicted') == 0:
-                    nepl_mismatch += 1
-        epl_ability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
-        f1 = indicators['F1']
-        performance = interpretability_weight * epl_ability + (1 - interpretability_weight) * f1
+        if len(md_list) > 0:
+            for row in predictions.itertuples():
+                if is_explicable(row, md_list):
+                    if getattr(row, 'predicted') == 1:
+                        epl_match += 1
+                else:
+                    if getattr(row, 'predicted') == 0:
+                        nepl_mismatch += 1
+        interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+        # if indicators["my_recall"] >= 0.8:
+        #     f1 = indicators["F1"]
+        # else:
+        #     f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+        if indicators["my_recall"] < 0.8:
+            return 1
+        f1 = indicators["F1"]
+        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
         return 1 - performance
-if __name__ == "__main__":
+def ml_er_hpo():
     classifier = Classifier()
     # Next, we create an object, holding general information about the run
     scenario = Scenario(
         classifier.configspace,
-        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
+        deterministic=True,
+        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
     )
-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=3)
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
     # Now we use SMAC to find the best hyperparameters
     smac = HyperparameterOptimizationFacade(
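SMAC minimizes the value returned by train, so that return value acts as a cost: any configuration whose my_recall falls below 0.8 gets the worst possible cost of 1, everything else gets 1 - performance, where performance = interpre_weight * interpretability + (1 - interpre_weight) * F1. A quick numeric check with made-up numbers (illustration only), using interpre_weight = 0.3 from settings:

interpre_weight = 0.3

def cost(f1: float, interpretability: float, my_recall: float) -> float:
    # mirrors the value Classifier.train returns above
    if my_recall < 0.8:
        return 1  # recall gate: a low-recall config can never become the incumbent
    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
    return 1 - performance

print(cost(0.90, 0.60, 0.85))  # 1 - (0.3 * 0.6 + 0.7 * 0.9) = 0.19, up to float rounding
print(cost(0.95, 0.20, 0.75))  # 1, rejected by the recall gate despite the higher F1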
@@ -268,9 +282,6 @@ if __name__ == "__main__":
         overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
     )
-    # todo
-    # if new_recall is too low, keep that configuration from becoming the incumbent
-    # either set the loss to 1, or use new_recall to pull F1 down and thereby raise the loss
     incumbent = smac.optimize()
     # Get cost of default configuration
@@ -280,6 +291,6 @@ if __name__ == "__main__":
     # Let's calculate the cost of the incumbent
     incumbent_cost = smac.validate(incumbent)
     print(f"Incumbent cost: {incumbent_cost}")
     print(f"Configuration:{incumbent.values()}")
-    print(f"MAX_F1:{1-classifier.train(incumbent)}")
+    return incumbent
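ml_er_hpo() now returns the incumbent Configuration instead of printing a score, and run() in the entrance script feeds it straight into the next ml_er round. A small usage sketch (illustration only; dict-style access on a ConfigSpace Configuration is the same pattern ml_er uses below):

hp_config = ml_er_hpo()
print(hp_config["ml_matcher"], hp_config["ml_blocker"], hp_config["block_attr"])
if hp_config["ml_blocker"] == "over_lap":
    print(hp_config["overlap_size"])  # only present when the overlap blocker was chosen
ml_er(2, hp_config)  # the next round reuses the tuned configuration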

@@ -1,7 +1,7 @@
 import time
 from md_discovery.functions.multi_process_infer_by_pairs import inference_from_record_pairs
 from md_discovery.functions.multi_process_infer_by_pairs import get_mds_metadata
-from entrance import *
+from settings import *
 # # if support and confidence are not written out, use the following two blocks of code
 # # write list 1 to a local path (adjust the path yourself)
@@ -19,8 +19,8 @@ from entrance import *
 def md_discover():
     # for now, new code can follow the pattern of this function
-    tp_single_tuple_path = "../../ml_er/output/tp_single_tuple.csv"
-    fn_single_tuple_path = "../../ml_er/output/fn_single_tuple.csv"
+    tp_single_tuple_path = "ml_er/output/tp_single_tuple.csv"
+    fn_single_tuple_path = "ml_er/output/fn_single_tuple.csv"
     # input: a csv file path, the similarity threshold for the MD left-hand side, and the target attribute on the MD right-hand side
     # output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # e.g. the parameters here require every left-hand-side similarity to be at least 0.7, with the right-hand side pointing to the 'id' attribute
@@ -36,8 +36,8 @@ def md_discover():
     # if support and confidence are written out, use the following two blocks of code
     # write list 1 to a local path (adjust the path yourself)
-    tp_mds_path = "../output/tp_mds.txt"
-    tp_vio_path = "../output/tp_vio.txt"
+    tp_mds_path = "md_discovery/output/tp_mds.txt"
+    tp_vio_path = "md_discovery/output/tp_vio.txt"
     with open(tp_mds_path, 'w') as f:
         for _ in tp_mds_meta:
@@ -51,8 +51,8 @@ def md_discover():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
-    fn_mds_path = "../output/fn_mds.txt"
-    fn_vio_path = "../output/fn_vio.txt"
+    fn_mds_path = "md_discovery/output/fn_mds.txt"
+    fn_vio_path = "md_discovery/output/fn_vio.txt"
     with open(fn_mds_path, 'w') as f:
         for _ in fn_mds_meta:
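md_discover() serializes each MD metadata dict as one line of tab-separated key:value pairs. A tiny round-trip sketch (illustration only, not part of the commit; the keys are assumptions, the real dicts come from get_mds_metadata, and load_mds's exact parsing is not shown in this diff):

mds_meta = [{'name': 0.7, 'description': 0.7, 'price': 1.0, 'support': 120, 'confidence': 0.85}]

with open('demo_mds.txt', 'w') as f:
    for md in mds_meta:
        for key in md.keys():
            f.write(key + ':' + str(md[key]) + '\t')
        f.write('\n')

# one plausible way to read such a line back into a dict
with open('demo_mds.txt', 'r') as f:
    for line in f:
        items = (kv.split(':') for kv in line.strip().split('\t'))
        print({k: float(v) for k, v in items})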

@@ -1,3 +1,4 @@
+import os
 import sys
 from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
@@ -11,19 +12,18 @@ import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
-from hpo.magellan_hpo import incumbent
+from settings import *
-def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "output/tp_single_tuple.csv", fn_single_tuple_path: str = "output/fn_single_tuple.csv"):
+def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "ml_er/output/tp_single_tuple.csv", fn_single_tuple_path: str = "ml_er/output/fn_single_tuple.csv"):
     # extract the true-positive and false-negative parts of the prediction table
     tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
     fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
     # make the left and right ids identical in the true-positive / false-negative tables
     for index, row in tp.iterrows():
-        tp.loc[index, "rtable_id"] = row["ltable_id"]
+        tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
     for index, row in fn.iterrows():
-        fn.loc[index, "rtable_id"] = row["ltable_id"]
+        fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
     pred_columns = pred.columns.values.tolist()
     l_columns = []
@@ -93,6 +93,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # the argument is a list of MD file paths
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # open each MD file
         with open(md_path, 'r') as f:
@@ -137,14 +139,10 @@ def load_data(left_path: str, right_path: str, mapping_path: str):
     return left, right, mapping
-def ml_er(config: Configuration):
+def ml_er(iter_round: int, config: Configuration = None, ):
     # todo:
     # if config is not None -> load configs
-    # else use default configs
-    # 1. block_attr
-    # 2. overlap_size
-    # 3. ml_matcher
-    # 4. ml_blocker
+    # else -> use default configs
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
     ltable.fillna("", inplace=True)
@@ -168,28 +166,58 @@ def ml_er(config: Configuration):
     selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
     selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # following the right table, rename left-table columns that correspond to right-table columns but are named differently
+    tables_id = rtable_id
     selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
     selected_attrs = selected_ltable.columns.values.tolist()  # column names of the two tables
+    items_but_id = selected_attrs[:]
+    items_but_id.remove(tables_id)  # column names of the two tables, excluding the id
     attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
     attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-    cm.set_key(selected_ltable, ltable_id)
-    cm.set_key(selected_rtable, rtable_id)
+    cm.set_key(selected_ltable, tables_id)
+    cm.set_key(selected_rtable, tables_id)
+    if config is not None:
+        ml_matcher = config["ml_matcher"]
+        if ml_matcher == "dt":
+            matcher = em.DTMatcher(name='DecisionTree', random_state=0)
+        elif ml_matcher == "svm":
+            matcher = em.SVMMatcher(name='SVM', random_state=0)
+        elif ml_matcher == "rf":
+            matcher = em.RFMatcher(name='RF', random_state=0)
+        elif ml_matcher == "lg":
+            matcher = em.LogRegMatcher(name='LogReg', random_state=0)
+        elif ml_matcher == "ln":
+            matcher = em.LinRegMatcher(name='LinReg')
+        elif ml_matcher == "nb":
+            matcher = em.NBMatcher(name='NaiveBayes')
+        if config["ml_blocker"] == "over_lap":
+            blocker = em.OverlapBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             overlap_size=config["overlap_size"], show_progress=False)
+        elif config["ml_blocker"] == "attr_equiv":
+            blocker = em.AttrEquivalenceBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+    else:
+        matcher = em.RFMatcher(name='RF', random_state=0)
+        blocker = em.OverlapBlocker()
+        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
+                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                         overlap_size=1, show_progress=False)
-    blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
-                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                     overlap_size=1, show_progress=False)
     candidate['gold'] = 0
     candidate_match_rows = []
     for index, row in candidate.iterrows():
-        l_id = row['ltable_' + ltable_id]
+        l_id = row['ltable_' + tables_id]
         map_row = mappings[mappings[mapping_lid] == l_id]
         if map_row is not None:
             r_id = map_row[mapping_rid]
             for value in r_id:
-                if value == row['rtable_' + rtable_id]:
+                if value == row['rtable_' + tables_id]:
                     candidate_match_rows.append(row["_id"])
                 else:
                     continue
@@ -204,19 +232,18 @@ def ml_er(config: Configuration):
     # concatenate the positive and negative samples
     candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
     cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
     cm.set_ltable(candidate_for_train_test, selected_ltable)
     cm.set_rtable(candidate_for_train_test, selected_rtable)
     # split into training and test sets
     train_proportion = 0.7
     test_proportion = 0.3
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=0.7, random_state=0)
+    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
     train_set = sets['train']
     test_set = sets['test']
-    rf = em.RFMatcher(name='RF', random_state=0)
     feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
     train_feature_vecs = em.extract_feature_vecs(train_set,
@@ -224,20 +251,19 @@ def ml_er(config: Configuration):
                                                   attrs_after=['gold'],
                                                   show_progress=False)
+    test_feature_after = attrs_with_l_prefix[:]
+    test_feature_after.extend(attrs_with_r_prefix)
+    for _ in test_feature_after:
+        if _.endswith(tables_id):
+            test_feature_after.remove(_)
+    test_feature_after.append('gold')
     test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                attrs_after=['ltable_name', 'ltable_description', 'ltable_manufacturer',
-                                                             'ltable_price', 'rtable_name', 'rtable_description',
-                                                             'rtable_manufacturer', 'rtable_price', 'gold'],
-                                                show_progress=False)
-    rf.fit(table=train_feature_vecs,
-           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-           target_attr='gold')
-    predictions = rf.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_name',
-                                                                     'ltable_description', 'ltable_manufacturer',
-                                                                     'ltable_price', 'rtable_name',
-                                                                     'rtable_description',
-                                                                     'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                attrs_after=test_feature_after, show_progress=False)
+    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                   append=True, target_attr='predicted', inplace=False)
     eval_result = em.eval_matches(predictions, 'gold', 'predicted')
     em.print_eval_summary(eval_result)
@@ -252,28 +278,35 @@ def ml_er(config: Configuration):
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
+    md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
     epl_match = 0  # explainable and predicted as a match
     nepl_mismatch = 0  # not explainable and predicted as a mismatch
-    p_md = "/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt"
-    p_vio = "/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt"
-    md_paths: list = [p_md, p_vio]
-    md_list = load_mds(md_paths)  # read in all the MDs
-    for row in predictions.itertuples():
-        if is_explicable(row, md_list):
-            if getattr(row, 'predicted') == 1:
-                epl_match += 1
-        else:
-            if getattr(row, 'predicted') == 0:
-                nepl_mismatch += 1
-    epl_ability = (epl_match + nepl_mismatch) / len(predictions)
+    md_list = load_mds(md_paths)  # read in all the MDs
+    if len(md_list) > 0:
+        for row in predictions.itertuples():
+            if is_explicable(row, md_list):
+                if getattr(row, 'predicted') == 1:
+                    epl_match += 1
+            else:
+                if getattr(row, 'predicted') == 0:
+                    nepl_mismatch += 1
+    interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+    if indicators["my_recall"] >= 0.8:
+        f1 = indicators["F1"]
+    else:
+        f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
     ################################################################################################################
     process_prediction_for_md_discovery(predictions)
-    output_path = "output/eval_result" + str(time.time()) + ".txt"
+    output_path = "ml_er/output/eval_result_" + str(iter_round) + ".txt"
     with open(output_path, 'w') as f:
         for key, value in six.iteritems(_get_metric(eval_result)):
             f.write(key + " : " + value)
             f.write('\n')
-        f.write('my_recall:' + str(indicators["my_recall"]))
-        f.write('\n')
+        f.write('my_recall:' + str(indicators["my_recall"]) + '\n')
+        f.write('interpretability:' + str(interpretability) + '\n')
+        f.write('performance:' + str(performance) + '\n')
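Unlike the HPO cost above, ml_er does not bail out when recall is low: if my_recall drops below 0.8 it recomputes F1 as the harmonic mean of precision and my_recall, and that value feeds the logged performance. A worked example with made-up indicator values (illustration only):

indicators = {"precision": 0.90, "my_recall": 0.60, "F1": 0.92}
interpre_weight = 0.3
interpretability = 0.50

if indicators["my_recall"] >= 0.8:
    f1 = indicators["F1"]
else:
    f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1

print(round(f1, 3))           # 0.72, the harmonic mean of 0.90 and 0.60
print(round(performance, 3))  # 0.654 = 0.3 * 0.5 + 0.7 * 0.72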

@@ -0,0 +1,12 @@
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
+mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # name of the id column in the left table
+rtable_id = 'id'  # name of the id column in the right table
+target_attr = 'id'  # target attribute for MD discovery
+lr_attrs_map = {'title': 'name'}  # if corresponding columns are named differently in the two tables, list them here so they can be renamed to match
+similarity_threshold = 0.7
+confidence_threshold = 0.8
+interpre_weight = 0.3  # interpretability weight
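Every module now reads these values via from settings import *. The lr_attrs_map entry is what lets the rest of the pipeline treat the two tables as sharing one schema: the left table's 'title' column is renamed to the right table's 'name' before blocking and feature generation, and the id column name is unified as tables_id = rtable_id. A minimal illustration with toy frames (illustration only, not the real CSVs):

import pandas as pd

lr_attrs_map = {'title': 'name'}
ltable = pd.DataFrame({'id': [1], 'title': ['canon powershot a510'], 'price': [199.0]})
rtable = pd.DataFrame({'id': ['g1'], 'name': ['canon powershot a510 camera'], 'price': [189.0]})

ltable = ltable.rename(columns=lr_attrs_map)
selected_attrs = ltable.columns.values.tolist()
print(selected_attrs)                                    # ['id', 'name', 'price']
print(selected_attrs == rtable.columns.values.tolist())  # True: both tables now share one schema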