From e18295838abf6d5f546ce4ce5c2c319b7130b2e1 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Mon, 25 Sep 2023 09:10:21 +0800
Subject: [PATCH] 9.24

---
 entrance.py                                   |  37 +++--
 hpo/{magellan_hpo.py => er_model_hpo.py}      | 131 +++++++++--------
 md_discovery/script/md_discover.py            |  14 +-
 ...Goods Dataset.py => ml_entity_resolver.py} | 135 +++++++++++-------
 settings.py                                   |  12 ++
 5 files changed, 192 insertions(+), 137 deletions(-)
 rename hpo/{magellan_hpo.py => er_model_hpo.py} (76%)
 rename ml_er/{Goods Dataset.py => ml_entity_resolver.py} (67%)
 create mode 100644 settings.py

diff --git a/entrance.py b/entrance.py
index a0ac5d6..eca4e6a 100644
--- a/entrance.py
+++ b/entrance.py
@@ -1,29 +1,28 @@
 # this is the entrance of the auto-ER procedure
 from md_discovery.script.md_discover import md_discover
+from ml_er.ml_entity_resolver import ml_er
+from hpo.er_model_hpo import ml_er_hpo
+from settings import *
 
 
-ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-mapping_lid = 'idAmazon'  # mapping表中左表id名
-mapping_rid = 'idGoogleBase'  # mapping表中右表id名
-ltable_id = 'id'  # 左表id字段名称
-rtable_id = 'id'  # 右表id字段名称
-target_attr = 'id'  # 进行md挖掘时的目标字段
-lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-similarity_threshold = 0.7
-confidence_threshold = 0.8
-interpretability_weight = 0.3
-
-def run(l_table_path, r_table_path, mapping_path):
-    # while The termination condition is not met:
-    while True:
-    #     er()
+def run(rounds: int):
+    """Run `rounds` cycles of ER -> MD discovery -> HPO, then one more ER pass with the last config."""
+    hp_config = None  # no tuned configuration exists yet, so the first ml_er call receives None
+    iter_round = 1
+    for i in range(0, rounds):
+        ml_er(iter_round, hp_config)
         md_discover()
-    #     hpo()
+        hp_config = ml_er_hpo()
+        iter_round += 1
+    ml_er(iter_round, hp_config)
     return
 
 
 if __name__ == '__main__':
-    # todo 距离度量用户可设置?
+    path = 'md_discovery/output'  # NOTE(review): not referenced anywhere else in this hunk - confirm it is needed
+    # TODO:
+    #  - should the distance metric be user-configurable?
+    #  - use drop() to remove columns from the feature vectors (e.g. the id-related features)?
+    run(1)
+    # ml_er(1)
     print(ltable_path)
diff --git a/hpo/magellan_hpo.py b/hpo/er_model_hpo.py
similarity index 76%
rename from hpo/magellan_hpo.py
rename to hpo/er_model_hpo.py
index ff5775f..395b499 100644
--- a/hpo/magellan_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -1,4 +1,5 @@
-from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+import os
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
@@ -6,15 +7,13 @@ import pandas as pd
 
 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
+from settings import *
 
 # 数据在外部加载
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-cm.set_key(ltable, ltable_id)
 ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-cm.set_key(rtable, rtable_id)
 rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)
 
@@ -32,21 +31,12 @@ for index, row in mappings.iterrows():
 # 仅保留两表中出现在映射表中的行，增大正样本比例
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
 selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+tables_id = rtable_id  # both tables are addressed by the right table's ID name from here on; assumes lr_attrs_map renames the left ID column when the two names differ - TODO confirm
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
-attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-cm.set_key(selected_ltable, ltable_id)
-cm.set_key(selected_rtable, rtable_id)
 ########################################################################################################################
 
 
-def test_test():
-    block_attr_items = selected_attrs[:]
-    block_attr_items.remove(rtable_id)
-    print(block_attr_items)
-
-
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
                         test_proportion: float) -> dict:
     new_df = df.reset_index(drop=False, inplace=False)
@@ -82,6 +72,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # 传入md路径列表
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # 打开每一个md文件
         with open(md_path, 'r') as f:
@@ -102,7 +94,7 @@ def is_explicable(row, all_mds: list) -> bool:
         explicable = True  # 假设这条md能解释当前元组
         for a in attrs:
             threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(row, 'ltable_'+a)), str(getattr(row, 'rtable_'+a))) < threshold:
+            if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
                 explicable = False  # 任意一个字段的相似度达不到阈值，这条md就不能解释当前元组
                 break  # 不再与当前md的其他相似度阈值比较，跳转到下一条md
         if explicable:
@@ -116,13 +108,12 @@ class Classifier:
         # Build Configuration Space which defines all parameters and their ranges
         cs = ConfigurationSpace(seed=0)
         block_attr_items = selected_attrs[:]
-        block_attr_items.remove(rtable_id)
+        block_attr_items.remove(tables_id)
 
         block_attr = Categorical("block_attr", block_attr_items)
         overlap_size = Integer("overlap_size", (1, 3), default=1)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        # todo 其他可调参数(如feature table删去某列)
 
         use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
         cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
@@ -131,6 +122,11 @@ class Classifier:
 
     # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
     def train(self, config: Configuration, seed: int = 0) -> float:
+
+        attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # attribute names with the left-table prefix
+        attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # attribute names with the right-table prefix
+        cm.set_key(selected_ltable, tables_id)  # mark the ID column as the key of the left table
+        cm.set_key(selected_rtable, tables_id)  # mark the ID column as the key of the right table
         if config["ml_blocker"] == "over_lap":
             blocker = em.OverlapBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@@ -145,13 +141,13 @@ class Classifier:
 
         candidate_match_rows = []
         for index, row in candidate.iterrows():
-            l_id = row['ltable_' + ltable_id]
+            l_id = row['ltable_' + tables_id]
             map_row = mappings[mappings[mapping_lid] == l_id]
 
             if map_row is not None:
                 r_id = map_row[mapping_rid]
                 for value in r_id:
-                    if value == row['rtable_' + rtable_id]:
+                    if value == row['rtable_' + tables_id]:
                         candidate_match_rows.append(row["_id"])
             else:
                 continue
@@ -165,9 +161,12 @@ class Classifier:
             candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
         # 拼接正负样本
         candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
+        if len(candidate_for_train_test) == 0:
+            return 1
+
         cm.set_key(candidate_for_train_test, '_id')
-        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
         cm.set_ltable(candidate_for_train_test, selected_ltable)
         cm.set_rtable(candidate_for_train_test, selected_rtable)
 
@@ -178,7 +177,18 @@ class Classifier:
         train_set = sets['train']
         test_set = sets['test']
 
-        matcher = None
+        cm.set_key(train_set, '_id')
+        cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
+        cm.set_ltable(train_set, selected_ltable)
+        cm.set_rtable(train_set, selected_rtable)
+
+        cm.set_key(test_set, '_id')
+        cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
+        cm.set_ltable(test_set, selected_ltable)
+        cm.set_rtable(test_set, selected_rtable)
+
         if config["ml_matcher"] == "dt":
             matcher = em.DTMatcher(name='DecisionTree', random_state=0)
         elif config["ml_matcher"] == "svm":
@@ -198,25 +208,21 @@ class Classifier:
                                                      attrs_after=['gold'],
                                                      show_progress=False)
 
-        # todo 属性名解耦
+        # Pass every non-key attribute through to the test feature vectors. Filter by exact column
+        # name into a new list: remove() on a list that is being iterated skips the next element.
+        key_columns = ('ltable_' + tables_id, 'rtable_' + tables_id)
+        test_feature_after = [c for c in attrs_with_l_prefix + attrs_with_r_prefix
+                              if c not in key_columns]
+        test_feature_after.append('gold')
         test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
-                                                                 'ltable_price', 'rtable_name', 'rtable_description',
-                                                                 'rtable_manufacturer', 'rtable_price', 'gold'], show_progress=False)
-
-        # todo 参数可调 用drop删除特征向量中的列？
-        # 1.exclude_attrs
-        # 去掉id相关的相似度
-        matcher.fit(table=train_feature_vecs,
-               exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-               target_attr='gold')
-
-        # 1.exclude_attrs
-        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title',
-                                                                         'ltable_description', 'ltable_manufacturer',
-                                                                         'ltable_price', 'rtable_name', 'rtable_description',
-                                                                         'rtable_manufacturer', 'rtable_price', 'gold'],
-                                 append=True, target_attr='predicted', inplace=False)
+                                                    attrs_after=test_feature_after, show_progress=False)
+
+        fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+        matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+
+        test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
+                                      append=True, target_attr='predicted', inplace=False)
         eval_result = em.eval_matches(predictions, 'gold', 'predicted')
         em.print_eval_summary(eval_result)
         indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, test_proportion)
@@ -231,34 +237,42 @@ class Classifier:
 
         #  默认路径为 "../md_discovery/output/xxx.txt"
         #  真阳/假阴  mds/vio  共4个md文件
-        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
-                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
+        md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                    'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
         epl_match = 0  # 可解释，预测match
         nepl_mismatch = 0  # 不可解释，预测mismatch
         md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
-        for row in predictions.itertuples():
-            if is_explicable(row, md_list):
-                if getattr(row, 'predicted') == 1:
-                    epl_match += 1
-            else:
-                if getattr(row, 'predicted') == 0:
-                    nepl_mismatch += 1
-        epl_ability = (epl_match + nepl_mismatch) / len(predictions)  # 可解释性
-        f1 = indicators['F1']
-        performance = interpretability_weight * epl_ability + (1 - interpretability_weight) * f1
+        if len(md_list) > 0:
+            for row in predictions.itertuples():
+                if is_explicable(row, md_list):
+                    if getattr(row, 'predicted') == 1:
+                        epl_match += 1
+                else:
+                    if getattr(row, 'predicted') == 0:
+                        nepl_mismatch += 1
+        # Interpretability: share of predictions that agree with the MDs, i.e. explicable pairs
+        # predicted as match plus inexplicable pairs predicted as mismatch.
+        interpretability = (epl_match + nepl_mismatch) / len(predictions)
+        # Recall cut-off: a configuration whose my_recall is below 0.8 is given cost 1,
+        # the same cost as a performance of 0, instead of being scored on F1.
+        if indicators["my_recall"] < 0.8:
+            return 1
+        f1 = indicators["F1"]
+        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
         return 1 - performance
 
 
-if __name__ == "__main__":
+def ml_er_hpo():
     classifier = Classifier()
 
     # Next, we create an object, holding general information about the run
     scenario = Scenario(
         classifier.configspace,
-        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
+        deterministic=True,
+        n_trials=10,  # Run at most 10 trials
     )
 
-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=3)
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
 
     # Now we use SMAC to find the best hyperparameters
     smac = HyperparameterOptimizationFacade(
@@ -268,9 +282,6 @@ if __name__ == "__main__":
         overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
     )
 
-    # todo
-    #  如果new_recall过低则避免其成为最优解
-    #  将损失函数置为1/用new_recall降低F1从而提高损失函数
     incumbent = smac.optimize()
 
     # Get cost of default configuration
@@ -280,6 +291,6 @@ if __name__ == "__main__":
     # Let's calculate the cost of the incumbent
     incumbent_cost = smac.validate(incumbent)
     print(f"Incumbent cost: {incumbent_cost}")
-
     print(f"Configuration:{incumbent.values()}")
-    print(f"MAX_F1:{1-classifier.train(incumbent)}")
+
+    return incumbent
diff --git a/md_discovery/script/md_discover.py b/md_discovery/script/md_discover.py
index 6a87c9b..c3b9908 100644
--- a/md_discovery/script/md_discover.py
+++ b/md_discovery/script/md_discover.py
@@ -1,7 +1,7 @@
 import time
 from md_discovery.functions.multi_process_infer_by_pairs import inference_from_record_pairs
 from md_discovery.functions.multi_process_infer_by_pairs import get_mds_metadata
-from entrance import *
+from settings import *
 
 # # 若不输出support和confidence，使用以下两块代码
 # # 将列表1写入本地，路径需自己修改
@@ -19,8 +19,8 @@ from entrance import *
 
 def md_discover():
     # 目前可以仿照这个main函数写
-    tp_single_tuple_path = "../../ml_er/output/tp_single_tuple.csv"
-    fn_single_tuple_path = "../../ml_er/output/fn_single_tuple.csv"
+    tp_single_tuple_path = "ml_er/output/tp_single_tuple.csv"
+    fn_single_tuple_path = "ml_er/output/fn_single_tuple.csv"
     # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
     # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8)
     # 例如此处输入参数要求md左侧相似度字段至少为0.7，右侧指向'id'字段
@@ -36,8 +36,8 @@ def md_discover():
 
     # 若输出support和confidence，使用以下两块代码
     # 将列表1写入本地，路径需自己修改
-    tp_mds_path = "../output/tp_mds.txt"
-    tp_vio_path = "../output/tp_vio.txt"
+    tp_mds_path = "md_discovery/output/tp_mds.txt"
+    tp_vio_path = "md_discovery/output/tp_vio.txt"
     
     with open(tp_mds_path, 'w') as f:
         for _ in tp_mds_meta:
@@ -51,8 +51,8 @@ def md_discover():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
 
-    fn_mds_path = "../output/fn_mds.txt"
-    fn_vio_path = "../output/fn_vio.txt"
+    fn_mds_path = "md_discovery/output/fn_mds.txt"
+    fn_vio_path = "md_discovery/output/fn_vio.txt"
 
     with open(fn_mds_path, 'w') as f:
         for _ in fn_mds_meta:
diff --git a/ml_er/Goods Dataset.py b/ml_er/ml_entity_resolver.py
similarity index 67%
rename from ml_er/Goods Dataset.py
rename to ml_er/ml_entity_resolver.py
index d864bad..1063eca 100644
--- a/ml_er/Goods Dataset.py	
+++ b/ml_er/ml_entity_resolver.py
@@ -1,3 +1,4 @@
+import os
 import sys
 
 from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
@@ -11,19 +12,18 @@ import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
-from hpo.magellan_hpo import incumbent
+from settings import *
 
 
-def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "output/tp_single_tuple.csv", fn_single_tuple_path: str = "output/fn_single_tuple.csv"):
+def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "ml_er/output/tp_single_tuple.csv", fn_single_tuple_path: str = "ml_er/output/fn_single_tuple.csv"):
     # 提取预测表中真阳和假阴部分
     tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
     fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
     # 将真阳/假阴表中左右ID调整一致
     for index, row in tp.iterrows():
-        tp.loc[index, "rtable_id"] = row["ltable_id"]
+        tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
     for index, row in fn.iterrows():
-        fn.loc[index, "rtable_id"] = row["ltable_id"]
+        fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
 
     pred_columns = pred.columns.values.tolist()
     l_columns = []
@@ -93,6 +93,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # 传入md路径列表
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # 打开每一个md文件
         with open(md_path, 'r') as f:
@@ -137,14 +139,10 @@ def load_data(left_path: str, right_path: str, mapping_path: str):
     return left, right, mapping
 
 
-def ml_er(config: Configuration):
+def ml_er(iter_round: int, config: Configuration = None, ):
     # todo:
     #  if config is not None -> load configs
-    #  else use default configs
-    #  1. block_attr
-    #  2. overlap_size
-    #  3. ml_matcher
-    #  4. ml_blocker
+    #  else -> use default configs
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
     ltable.fillna("", inplace=True)
@@ -168,28 +166,58 @@ def ml_er(config: Configuration):
 
     selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
     selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+    tables_id = rtable_id
     selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
     selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
+    items_but_id = selected_attrs[:]
+    items_but_id.remove(tables_id)  # attribute names of the two tables, without the id column
     attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
     attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-    cm.set_key(selected_ltable, ltable_id)
-    cm.set_key(selected_rtable, rtable_id)
+    cm.set_key(selected_ltable, tables_id)
+    cm.set_key(selected_rtable, tables_id)
+
+    if config is not None:
+        ml_matcher = config["ml_matcher"]
+        if ml_matcher == "dt":
+            matcher = em.DTMatcher(name='DecisionTree', random_state=0)
+        elif ml_matcher == "svm":
+            matcher = em.SVMMatcher(name='SVM', random_state=0)
+        elif ml_matcher == "rf":
+            matcher = em.RFMatcher(name='RF', random_state=0)
+        elif ml_matcher == "lg":
+            matcher = em.LogRegMatcher(name='LogReg', random_state=0)
+        elif ml_matcher == "ln":
+            matcher = em.LinRegMatcher(name='LinReg')
+        elif ml_matcher == "nb":
+            matcher = em.NBMatcher(name='NaiveBayes')
+
+        if config["ml_blocker"] == "over_lap":
+            blocker = em.OverlapBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             overlap_size=config["overlap_size"], show_progress=False)
+        elif config["ml_blocker"] == "attr_equiv":
+            blocker = em.AttrEquivalenceBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+    else:
+        matcher = em.RFMatcher(name='RF', random_state=0)
+        blocker = em.OverlapBlocker()
+        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
+                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                         overlap_size=1, show_progress=False)
 
-    blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
-                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                     overlap_size=1, show_progress=False)
     candidate['gold'] = 0
 
     candidate_match_rows = []
     for index, row in candidate.iterrows():
-        l_id = row['ltable_' + ltable_id]
+        l_id = row['ltable_' + tables_id]
         map_row = mappings[mappings[mapping_lid] == l_id]
 
         if map_row is not None:
             r_id = map_row[mapping_rid]
             for value in r_id:
-                if value == row['rtable_' + rtable_id]:
+                if value == row['rtable_' + tables_id]:
                     candidate_match_rows.append(row["_id"])
         else:
             continue
@@ -204,19 +232,18 @@ def ml_er(config: Configuration):
     # 拼接正负样本
     candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
     cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
     cm.set_ltable(candidate_for_train_test, selected_ltable)
     cm.set_rtable(candidate_for_train_test, selected_rtable)
 
     # 分为训练测试集
     train_proportion = 0.7
     test_proportion = 0.3
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=0.7, random_state=0)
+    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
     train_set = sets['train']
     test_set = sets['test']
 
-    rf = em.RFMatcher(name='RF', random_state=0)
     feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
 
     train_feature_vecs = em.extract_feature_vecs(train_set,
@@ -224,20 +251,19 @@ def ml_er(config: Configuration):
                                                  attrs_after=['gold'],
                                                  show_progress=False)
 
+    # Pass every non-key attribute through to the test feature vectors. Filter by exact column
+    # name into a new list: remove() on a list that is being iterated skips the next element.
+    key_columns = ('ltable_' + tables_id, 'rtable_' + tables_id)
+    test_feature_after = [c for c in attrs_with_l_prefix + attrs_with_r_prefix
+                          if c not in key_columns]
+    test_feature_after.append('gold')
     test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                attrs_after=['ltable_name', 'ltable_description', 'ltable_manufacturer',
-                                                             'ltable_price', 'rtable_name', 'rtable_description',
-                                                             'rtable_manufacturer', 'rtable_price', 'gold'],
-                                                show_progress=False)
-
-    rf.fit(table=train_feature_vecs,
-           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-           target_attr='gold')
-    predictions = rf.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_name',
-                                                                     'ltable_description', 'ltable_manufacturer',
-                                                                     'ltable_price', 'rtable_name',
-                                                                     'rtable_description',
-                                                                     'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                attrs_after=test_feature_after, show_progress=False)
+
+    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                              append=True, target_attr='predicted', inplace=False)
     eval_result = em.eval_matches(predictions, 'gold', 'predicted')
     em.print_eval_summary(eval_result)
@@ -252,28 +278,35 @@ def ml_er(config: Configuration):
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
 
+    md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
     epl_match = 0  # 可解释，预测match
     nepl_mismatch = 0  # 不可解释，预测mismatch
-    p_md = "/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt"
-    p_vio = "/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt"
-    md_paths: list = [p_md, p_vio]
-    md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
-    for row in predictions.itertuples():
-        if is_explicable(row, md_list):
-            if getattr(row, 'predicted') == 1:
-                epl_match += 1
-        else:
-            if getattr(row, 'predicted') == 0:
-                nepl_mismatch += 1
 
-    epl_ability = (epl_match + nepl_mismatch) / len(predictions)
+    md_list = load_mds(md_paths)  # load the MDs from the files listed in md_paths; load_mds skips paths that do not exist
+    if len(md_list) > 0:
+        for row in predictions.itertuples():
+            if is_explicable(row, md_list):
+                if getattr(row, 'predicted') == 1:
+                    epl_match += 1
+            else:
+                if getattr(row, 'predicted') == 0:
+                    nepl_mismatch += 1
+
+    interpretability = (epl_match + nepl_mismatch) / len(predictions)  # share of predictions that agree with the MDs
+    precision, recall = indicators["precision"], indicators["my_recall"]
+    # With my_recall below 0.8, F1 is recomputed from my_recall (0.0 when precision + my_recall is 0).
+    f1 = indicators["F1"] if recall >= 0.8 else (
+        (2.0 * precision * recall) / (precision + recall) if precision + recall else 0.0)
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
     ################################################################################################################
     process_prediction_for_md_discovery(predictions)
 
-    output_path = "output/eval_result" + str(time.time()) + ".txt"
+    output_path = "ml_er/output/eval_result_" + str(iter_round) + ".txt"
     with open(output_path, 'w') as f:
         for key, value in six.iteritems(_get_metric(eval_result)):
             f.write(key + " : " + value)
             f.write('\n')
-        f.write('my_recall:' + str(indicators["my_recall"]))
-        f.write('\n')
+        f.write('my_recall:' + str(indicators["my_recall"]) + '\n')
+        f.write('interpretability:' + str(interpretability) + '\n')
+        f.write('performance:' + str(performance) + '\n')
diff --git a/settings.py b/settings.py
new file mode 100644
index 0000000..905e567
--- /dev/null
+++ b/settings.py
@@ -0,0 +1,12 @@
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
+mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # name of the id column in the left table
+rtable_id = 'id'  # name of the id column in the right table
+target_attr = 'id'  # target attribute for MD discovery
+lr_attrs_map = {'title': 'name'}  # left-table column name -> right-table column name, for corresponding columns named differently in the two tables
+similarity_threshold = 0.7  # presumably the minimum similarity on the left-hand side of a discovered MD - confirm in md_discover
+confidence_threshold = 0.8  # presumably the minimum confidence for keeping an MD that has violations - confirm in md_discover
+interpre_weight = 0.3  # weight of interpretability in the performance score; F1 gets 1 - interpre_weight