From b1c0abb6649001ec00a367bd6e04beed7834d764 Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Tue, 26 Sep 2023 22:18:13 +0800 Subject: [PATCH] =?UTF-8?q?bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- entrance.py | 2 ++ hpo/er_model_hpo.py | 40 ++++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/entrance.py b/entrance.py index eca4e6a..d51bf8a 100644 --- a/entrance.py +++ b/entrance.py @@ -25,4 +25,6 @@ if __name__ == '__main__': # 使用drop删除特征向量中的列?(如删除id相关特征) run(1) # ml_er(1) + # todo 将优化结果与参数输出到文件中 + # 通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息 print(ltable_path) diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py index 395b499..81c1c76 100644 --- a/hpo/er_model_hpo.py +++ b/hpo/er_model_hpo.py @@ -1,4 +1,6 @@ import os +import time + from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer from ConfigSpace.conditions import InCondition import py_entitymatching as em @@ -37,7 +39,7 @@ selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字 ######################################################################################################################## -def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, +def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int, test_proportion: float) -> dict: new_df = df.reset_index(drop=False, inplace=False) gold = new_df[labeled_attr] @@ -61,7 +63,7 @@ def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall) - my_recall = num_true_positives / 
(matching_number * test_proportion) + my_recall = num_true_positives / (couple_number * test_proportion) return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall} @@ -88,13 +90,13 @@ def load_mds(paths: list) -> list: return all_mds -def is_explicable(row, all_mds: list) -> bool: +def is_explicable(line, all_mds: list) -> bool: attrs = all_mds[0].keys() # 从第一条md中读取所有字段 for md in all_mds: explicable = True # 假设这条md能解释当前元组 for a in attrs: threshold = md[a] - if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold: + if my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)), str(getattr(line, 'rtable_' + a))) < threshold: explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 break # 不再与当前md的其他相似度阈值比较,跳转到下一条md if explicable: @@ -122,7 +124,8 @@ class Classifier: # train 就是整个函数 只需将返回结果由预测变成预测结果的评估 def train(self, config: Configuration, seed: int = 0) -> float: - + # print(f"DEBUG:{config.values()}") + cm.del_catalog() attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀 attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀 cm.set_key(selected_ltable, tables_id) @@ -131,28 +134,28 @@ class Classifier: blocker = em.OverlapBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, - overlap_size=config["overlap_size"], show_progress=False, n_jobs=-1) + overlap_size=config["overlap_size"], show_progress=False) elif config["ml_blocker"] == "attr_equiv": blocker = em.AttrEquivalenceBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], - l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1) + l_output_attrs=selected_attrs, r_output_attrs=selected_attrs) candidate['gold'] = 0 candidate_match_rows = [] - for index, row in candidate.iterrows(): - l_id = 
row['ltable_' + tables_id] + for index_num, line in candidate.iterrows(): + l_id = line['ltable_' + tables_id] map_row = mappings[mappings[mapping_lid] == l_id] if map_row is not None: r_id = map_row[mapping_rid] for value in r_id: - if value == row['rtable_' + tables_id]: - candidate_match_rows.append(row["_id"]) + if value == line['rtable_' + tables_id]: + candidate_match_rows.append(line["_id"]) else: continue - for row in candidate_match_rows: - candidate.loc[row, 'gold'] = 1 + for line in candidate_match_rows: + candidate.loc[line, 'gold'] = 1 # 裁剪负样本,保持正负样本数量一致 candidate_mismatch = candidate[candidate['gold'] == 0] @@ -243,12 +246,12 @@ class Classifier: nepl_mismatch = 0 # 不可解释,预测mismatch md_list = load_mds(md_paths) # 从全局变量中读取所有的md if len(md_list) > 0: - for row in predictions.itertuples(): - if is_explicable(row, md_list): - if getattr(row, 'predicted') == 1: + for line in predictions.itertuples(): + if is_explicable(line, md_list): + if getattr(line, 'predicted') == 1: epl_match += 1 else: - if getattr(row, 'predicted') == 0: + if getattr(line, 'predicted') == 0: nepl_mismatch += 1 interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 # if indicators["my_recall"] >= 0.8: @@ -270,6 +273,7 @@ def ml_er_hpo(): classifier.configspace, deterministic=True, n_trials=10, # We want to run max 50 trials (combination of config and seed) + n_workers=2 ) initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) @@ -291,6 +295,6 @@ def ml_er_hpo(): # Let's calculate the cost of the incumbent incumbent_cost = smac.validate(incumbent) print(f"Incumbent cost: {incumbent_cost}") - print(f"Configuration:{incumbent.values()}") + print(f"Optimized_Configuration:{incumbent.values()}") return incumbent