9.24

2 years ago · e18295838a
parent c349768eaf
commit e18295838a
5 changed files with 192 additions and 137 deletions
--- a/entrance.py
+++ b/entrance.py
@ -1,29 +1,28 @@
 # this is the entrance of the auto-ER procedure
 from md_discovery.script.md_discover import md_discover
+from ml_er.ml_entity_resolver import ml_er
+from hpo.er_model_hpo import ml_er_hpo
+from settings import *


-ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-mapping_lid = 'idAmazon'  # mapping表中左表id名
-mapping_rid = 'idGoogleBase'  # mapping表中右表id名
-ltable_id = 'id'  # 左表id字段名称
-rtable_id = 'id'  # 右表id字段名称
-target_attr = 'id'  # 进行md挖掘时的目标字段
-lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-similarity_threshold = 0.7
-confidence_threshold = 0.8
-interpretability_weight = 0.3
-
-def run(l_table_path, r_table_path, mapping_path):
-    # while The termination condition is not met:
-    while True:
-    #     er()
+def run(rounds: int):
+    hp_config = None  # no tuned configuration yet: the first ml_er call falls back to its defaults
+    # TODO: loop until a real termination condition is met; for now run a fixed number of rounds
+    iter_round = 1
+    for i in range(0, rounds):
+        ml_er(iter_round, hp_config)
        md_discover()
-    #     hpo()
+        hp_config = ml_er_hpo()
+        iter_round += 1
+    ml_er(iter_round, hp_config)  # final ER pass with the configuration from the last HPO run (None if rounds is 0)
    return


 if __name__ == '__main__':
-    # todo 距离度量用户可设置?
+    path = 'md_discovery/output'  # NOTE(review): not referenced anywhere else in this file
+    # TODO:
+    #  Should the distance metric be user-configurable?
+    #  Use drop to remove columns from the feature vectors? (e.g. id-related features)
+    run(1)  # one ER -> MD discovery -> HPO round, followed by a final ER pass
+    # ml_er(1)
    print(ltable_path)
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@ -1,4 +1,5 @@
-from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+import os
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
@ -6,15 +7,13 @@ import pandas as pd

 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
+from settings import *

 # 数据在外部加载
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-cm.set_key(ltable, ltable_id)
 ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-cm.set_key(rtable, rtable_id)
 rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)

@ -32,21 +31,12 @@ for index, row in mappings.iterrows():
 # 仅保留两表中出现在映射表中的行，增大正样本比例
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
 selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+tables_id = rtable_id  # single id column name used for both tables, taken from the right table; assumes the rename above maps the left id column onto it when the two names differ — TODO confirm lr_attrs_map covers that case
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
-attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-cm.set_key(selected_ltable, ltable_id)
-cm.set_key(selected_rtable, rtable_id)
 ########################################################################################################################


-def test_test():
-    block_attr_items = selected_attrs[:]
-    block_attr_items.remove(rtable_id)
-    print(block_attr_items)
-
-
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
                        test_proportion: float) -> dict:
    new_df = df.reset_index(drop=False, inplace=False)
@ -82,6 +72,8 @@ def load_mds(paths: list) -> list:
    all_mds = []
    # 传入md路径列表
    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
        mds = []
        # 打开每一个md文件
        with open(md_path, 'r') as f:
@ -102,7 +94,7 @@ def is_explicable(row, all_mds: list) -> bool:
        explicable = True  # 假设这条md能解释当前元组
        for a in attrs:
            threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(row, 'ltable_'+a)), str(getattr(row, 'rtable_'+a))) < threshold:
+            if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
                explicable = False  # 任意一个字段的相似度达不到阈值，这条md就不能解释当前元组
                break  # 不再与当前md的其他相似度阈值比较，跳转到下一条md
        if explicable:
@ -116,13 +108,12 @@ class Classifier:
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace(seed=0)
        block_attr_items = selected_attrs[:]
-        block_attr_items.remove(rtable_id)
+        block_attr_items.remove(tables_id)

        block_attr = Categorical("block_attr", block_attr_items)
        overlap_size = Integer("overlap_size", (1, 3), default=1)
        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
        ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        # todo 其他可调参数(如feature table删去某列)

        use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
        cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
@ -131,6 +122,11 @@ class Classifier:

    # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
    def train(self, config: Configuration, seed: int = 0) -> float:
+
+        attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # 字段名加左前缀
+        attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # 字段名加右前缀
+        cm.set_key(selected_ltable, tables_id)
+        cm.set_key(selected_rtable, tables_id)
        if config["ml_blocker"] == "over_lap":
            blocker = em.OverlapBlocker()
            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@ -145,13 +141,13 @@ class Classifier:

        candidate_match_rows = []
        for index, row in candidate.iterrows():
-            l_id = row['ltable_' + ltable_id]
+            l_id = row['ltable_' + tables_id]
            map_row = mappings[mappings[mapping_lid] == l_id]

            if map_row is not None:
                r_id = map_row[mapping_rid]
                for value in r_id:
-                    if value == row['rtable_' + rtable_id]:
+                    if value == row['rtable_' + tables_id]:
                        candidate_match_rows.append(row["_id"])
            else:
                continue
@ -165,9 +161,12 @@ class Classifier:
            candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
        # 拼接正负样本
        candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
+        if len(candidate_for_train_test) == 0:
+            return 1
+
        cm.set_key(candidate_for_train_test, '_id')
-        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
        cm.set_ltable(candidate_for_train_test, selected_ltable)
        cm.set_rtable(candidate_for_train_test, selected_rtable)

@ -178,7 +177,18 @@ class Classifier:
        train_set = sets['train']
        test_set = sets['test']

-        matcher = None
+        cm.set_key(train_set, '_id')
+        cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
+        cm.set_ltable(train_set, selected_ltable)
+        cm.set_rtable(train_set, selected_rtable)
+
+        cm.set_key(test_set, '_id')
+        cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
+        cm.set_ltable(test_set, selected_ltable)
+        cm.set_rtable(test_set, selected_rtable)
+
        if config["ml_matcher"] == "dt":
            matcher = em.DTMatcher(name='DecisionTree', random_state=0)
        elif config["ml_matcher"] == "svm":
@ -198,24 +208,20 @@ class Classifier:
                                                     attrs_after=['gold'],
                                                     show_progress=False)

-        # todo 属性名解耦
+        test_feature_after = attrs_with_l_prefix[:]
+        test_feature_after.extend(attrs_with_r_prefix)
+        for _ in test_feature_after:
+            if _.endswith(tables_id):
+                test_feature_after.remove(_)
+        test_feature_after.append('gold')
        test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
-                                                                 'ltable_price', 'rtable_name', 'rtable_description',
-                                                                 'rtable_manufacturer', 'rtable_price', 'gold'], show_progress=False)
-
-        # todo 参数可调 用drop删除特征向量中的列？
-        # 1.exclude_attrs
-        # 去掉id相关的相似度
-        matcher.fit(table=train_feature_vecs,
-               exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-               target_attr='gold')
-
-        # 1.exclude_attrs
-        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title',
-                                                                         'ltable_description', 'ltable_manufacturer',
-                                                                         'ltable_price', 'rtable_name', 'rtable_description',
-                                                                         'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                    attrs_after=test_feature_after, show_progress=False)
+
+        fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+        matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+
+        test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                      append=True, target_attr='predicted', inplace=False)
        eval_result = em.eval_matches(predictions, 'gold', 'predicted')
        em.print_eval_summary(eval_result)
@ -231,11 +237,12 @@ class Classifier:

        #  默认路径为 "../md_discovery/output/xxx.txt"
        #  真阳/假阴  mds/vio  共4个md文件
-        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
-                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
+        md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                    'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
        epl_match = 0  # 可解释，预测match
        nepl_mismatch = 0  # 不可解释，预测mismatch
        md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
+        if len(md_list) > 0:
            for row in predictions.itertuples():
                if is_explicable(row, md_list):
                    if getattr(row, 'predicted') == 1:
@ -243,22 +250,29 @@ class Classifier:
                else:
                    if getattr(row, 'predicted') == 0:
                        nepl_mismatch += 1
-        epl_ability = (epl_match + nepl_mismatch) / len(predictions)  # 可解释性
-        f1 = indicators['F1']
-        performance = interpretability_weight * epl_ability + (1 - interpretability_weight) * f1
+        interpretability = (epl_match + nepl_mismatch) / len(predictions)  # 可解释性
+        # if indicators["my_recall"] >= 0.8:
+        #     f1 = indicators["F1"]
+        # else:
+        #     f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+        if indicators["my_recall"] < 0.8:
+            return 1
+        f1 = indicators["F1"]
+        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
        return 1 - performance


-if __name__ == "__main__":
+def ml_er_hpo():
    classifier = Classifier()

    # Next, we create an object, holding general information about the run
    scenario = Scenario(
        classifier.configspace,
-        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
+        deterministic=True,
+        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
    )

-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=3)
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    # Now we use SMAC to find the best hyperparameters
    smac = HyperparameterOptimizationFacade(
@ -268,9 +282,6 @@ if __name__ == "__main__":
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

-    # todo
-    #  如果new_recall过低则避免其成为最优解
-    #  将损失函数置为1/用new_recall降低F1从而提高损失函数
    incumbent = smac.optimize()

    # Get cost of default configuration
@ -280,6 +291,6 @@ if __name__ == "__main__":
    # Let's calculate the cost of the incumbent
    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")
-
    print(f"Configuration:{incumbent.values()}")
-    print(f"MAX_F1:{1-classifier.train(incumbent)}")
+
+    return incumbent
--- a/md_discovery/script/md_discover.py
+++ b/md_discovery/script/md_discover.py
@ -1,7 +1,7 @@
 import time
 from md_discovery.functions.multi_process_infer_by_pairs import inference_from_record_pairs
 from md_discovery.functions.multi_process_infer_by_pairs import get_mds_metadata
-from entrance import *
+from settings import *

 # # 若不输出support和confidence，使用以下两块代码
 # # 将列表1写入本地，路径需自己修改
@ -19,8 +19,8 @@ from entrance import *

 def md_discover():
    # 目前可以仿照这个main函数写
-    tp_single_tuple_path = "../../ml_er/output/tp_single_tuple.csv"
-    fn_single_tuple_path = "../../ml_er/output/fn_single_tuple.csv"
+    tp_single_tuple_path = "ml_er/output/tp_single_tuple.csv"
+    fn_single_tuple_path = "ml_er/output/fn_single_tuple.csv"
    # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
    # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8)
    # 例如此处输入参数要求md左侧相似度字段至少为0.7，右侧指向'id'字段
@ -36,8 +36,8 @@ def md_discover():

    # 若输出support和confidence，使用以下两块代码
    # 将列表1写入本地，路径需自己修改
-    tp_mds_path = "../output/tp_mds.txt"
-    tp_vio_path = "../output/tp_vio.txt"
+    tp_mds_path = "md_discovery/output/tp_mds.txt"
+    tp_vio_path = "md_discovery/output/tp_vio.txt"
    
    with open(tp_mds_path, 'w') as f:
        for _ in tp_mds_meta:
@ -51,8 +51,8 @@ def md_discover():
                f.write(i + ':' + str(_[i]) + '\t')
            f.write('\n')

-    fn_mds_path = "../output/fn_mds.txt"
-    fn_vio_path = "../output/fn_vio.txt"
+    fn_mds_path = "md_discovery/output/fn_mds.txt"
+    fn_vio_path = "md_discovery/output/fn_vio.txt"

    with open(fn_mds_path, 'w') as f:
        for _ in fn_mds_meta:
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@ -1,3 +1,4 @@
+import os
 import sys

 from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
@ -11,19 +12,18 @@ import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
-from hpo.magellan_hpo import incumbent
+from settings import *


-def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "output/tp_single_tuple.csv", fn_single_tuple_path: str = "output/fn_single_tuple.csv"):
+def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "ml_er/output/tp_single_tuple.csv", fn_single_tuple_path: str = "ml_er/output/fn_single_tuple.csv"):
    # 提取预测表中真阳和假阴部分
    tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
    fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
    # 将真阳/假阴表中左右ID调整一致
    for index, row in tp.iterrows():
-        tp.loc[index, "rtable_id"] = row["ltable_id"]
+        tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
    for index, row in fn.iterrows():
-        fn.loc[index, "rtable_id"] = row["ltable_id"]
+        fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]

    pred_columns = pred.columns.values.tolist()
    l_columns = []
@ -93,6 +93,8 @@ def load_mds(paths: list) -> list:
    all_mds = []
    # 传入md路径列表
    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
        mds = []
        # 打开每一个md文件
        with open(md_path, 'r') as f:
@ -137,14 +139,10 @@ def load_data(left_path: str, right_path: str, mapping_path: str):
    return left, right, mapping


-def ml_er(config: Configuration):
+def ml_er(iter_round: int, config: Configuration = None, ):
    # todo:
    #  if config is not None -> load configs
-    #  else use default configs
-    #  1. block_attr
-    #  2. overlap_size
-    #  3. ml_matcher
-    #  4. ml_blocker
+    #  else -> use default configs
    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
    cm.set_key(ltable, ltable_id)
    ltable.fillna("", inplace=True)
@ -168,28 +166,58 @@ def ml_er(config: Configuration):

    selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
    selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+    tables_id = rtable_id
    selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
    selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
+    items_but_id = selected_attrs[:]
+    items_but_id.remove(tables_id)  # 两张表中除了id的字段名
    attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
    attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-    cm.set_key(selected_ltable, ltable_id)
-    cm.set_key(selected_rtable, rtable_id)
-
+    cm.set_key(selected_ltable, tables_id)
+    cm.set_key(selected_rtable, tables_id)
+
+    if config is not None:
+        ml_matcher = config["ml_matcher"]
+        if ml_matcher == "dt":
+            matcher = em.DTMatcher(name='DecisionTree', random_state=0)
+        elif ml_matcher == "svm":
+            matcher = em.SVMMatcher(name='SVM', random_state=0)
+        elif ml_matcher == "rf":
+            matcher = em.RFMatcher(name='RF', random_state=0)
+        elif ml_matcher == "lg":
+            matcher = em.LogRegMatcher(name='LogReg', random_state=0)
+        elif ml_matcher == "ln":
+            matcher = em.LinRegMatcher(name='LinReg')
+        elif ml_matcher == "nb":
+            matcher = em.NBMatcher(name='NaiveBayes')
+
+        if config["ml_blocker"] == "over_lap":
+            blocker = em.OverlapBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             overlap_size=config["overlap_size"], show_progress=False)
+        elif config["ml_blocker"] == "attr_equiv":
+            blocker = em.AttrEquivalenceBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+    else:
+        matcher = em.RFMatcher(name='RF', random_state=0)
        blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
+        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
                                         overlap_size=1, show_progress=False)
+
    candidate['gold'] = 0

    candidate_match_rows = []
    for index, row in candidate.iterrows():
-        l_id = row['ltable_' + ltable_id]
+        l_id = row['ltable_' + tables_id]
        map_row = mappings[mappings[mapping_lid] == l_id]

        if map_row is not None:
            r_id = map_row[mapping_rid]
            for value in r_id:
-                if value == row['rtable_' + rtable_id]:
+                if value == row['rtable_' + tables_id]:
                    candidate_match_rows.append(row["_id"])
        else:
            continue
@ -204,19 +232,18 @@ def ml_er(config: Configuration):
    # 拼接正负样本
    candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
    cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
    cm.set_ltable(candidate_for_train_test, selected_ltable)
    cm.set_rtable(candidate_for_train_test, selected_rtable)

    # 分为训练测试集
    train_proportion = 0.7
    test_proportion = 0.3
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=0.7, random_state=0)
+    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
    train_set = sets['train']
    test_set = sets['test']

-    rf = em.RFMatcher(name='RF', random_state=0)
    feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)

    train_feature_vecs = em.extract_feature_vecs(train_set,
@ -224,20 +251,19 @@ def ml_er(config: Configuration):
                                                 attrs_after=['gold'],
                                                 show_progress=False)

+    test_feature_after = attrs_with_l_prefix[:]
+    test_feature_after.extend(attrs_with_r_prefix)
+    for _ in test_feature_after:
+        if _.endswith(tables_id):
+            test_feature_after.remove(_)
+    test_feature_after.append('gold')
    test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                attrs_after=['ltable_name', 'ltable_description', 'ltable_manufacturer',
-                                                             'ltable_price', 'rtable_name', 'rtable_description',
-                                                             'rtable_manufacturer', 'rtable_price', 'gold'],
-                                                show_progress=False)
+                                                attrs_after=test_feature_after, show_progress=False)

-    rf.fit(table=train_feature_vecs,
-           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-           target_attr='gold')
-    predictions = rf.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_name',
-                                                                     'ltable_description', 'ltable_manufacturer',
-                                                                     'ltable_price', 'rtable_name',
-                                                                     'rtable_description',
-                                                                     'rtable_manufacturer', 'rtable_price', 'gold'],
+    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                             append=True, target_attr='predicted', inplace=False)
    eval_result = em.eval_matches(predictions, 'gold', 'predicted')
    em.print_eval_summary(eval_result)
@ -252,12 +278,13 @@ def ml_er(config: Configuration):
    predictions_attrs.extend(['gold', 'predicted'])
    predictions = predictions[predictions_attrs]

+    md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
    epl_match = 0  # 可解释，预测match
    nepl_mismatch = 0  # 不可解释，预测mismatch
-    p_md = "/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt"
-    p_vio = "/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt"
-    md_paths: list = [p_md, p_vio]
+
    md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
+    if len(md_list) > 0:
        for row in predictions.itertuples():
            if is_explicable(row, md_list):
                if getattr(row, 'predicted') == 1:
@ -266,14 +293,20 @@ def ml_er(config: Configuration):
                if getattr(row, 'predicted') == 0:
                    nepl_mismatch += 1

-    epl_ability = (epl_match + nepl_mismatch) / len(predictions)
+    interpretability = (epl_match + nepl_mismatch) / len(predictions)  # 可解释性
+    if indicators["my_recall"] >= 0.8:
+        f1 = indicators["F1"]
+    else:
+        f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
    ################################################################################################################
    process_prediction_for_md_discovery(predictions)

-    output_path = "output/eval_result" + str(time.time()) + ".txt"
+    output_path = "ml_er/output/eval_result_" + str(iter_round) + ".txt"
    with open(output_path, 'w') as f:
        for key, value in six.iteritems(_get_metric(eval_result)):
            f.write(key + " : " + value)
            f.write('\n')
-        f.write('my_recall:' + str(indicators["my_recall"]))
-        f.write('\n')
+        f.write('my_recall:' + str(indicators["my_recall"]) + '\n')
+        f.write('interpretability:' + str(interpretability) + '\n')
+        f.write('performance:' + str(performance) + '\n')
--- a/settings.py
+++ b/settings.py
@ -0,0 +1,12 @@
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
+mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # id column name of the left table
+rtable_id = 'id'  # id column name of the right table
+target_attr = 'id'  # target attribute used during MD discovery
+lr_attrs_map = {'title': 'name'}  # left -> right column renames for corresponding columns whose names differ between the two tables
+similarity_threshold = 0.7  # presumably the minimum similarity on the MD left-hand side — confirm in md_discover
+confidence_threshold = 0.8  # presumably the minimum confidence for MDs that have violations — confirm in md_discover
+interpre_weight = 0.3  # weight of interpretability (versus F1) in the combined performance score