From 882c25d20fdd7b7fd463ed79f376e3e46c1a90fa Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Sat, 28 Oct 2023 18:09:59 +0800 Subject: [PATCH] =?UTF-8?q?1.=E6=89=80=E6=9C=89=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E4=BE=9D=E9=9D=A0GPU=202.=E8=BF=AD=E4=BB=A3?= =?UTF-8?q?=E8=BD=AE=E6=95=B0=E8=87=AA=E5=8A=A8=E8=AF=86=E5=88=AB=203.?= =?UTF-8?q?=E8=B6=85=E5=8F=82=E6=95=B0=E4=BC=98=E5=8C=96=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E8=90=BD=E7=9B=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + hpo/er_model_hpo.py | 43 ++++-- md_discovery/md_discover.py | 55 ++----- md_discovery/multi_process_infer_by_pairs.py | 18 +-- md_discovery/tmp_discover.py | 106 ++----------- ml_er/ml_entity_resolver.py | 149 +++++++++++-------- settings.py | 3 +- tfile.py | 64 +++++++- 8 files changed, 212 insertions(+), 229 deletions(-) diff --git a/.gitignore b/.gitignore index 6c7c123..a9db128 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /deprecated/ /datasets/ +/ml_er/output/* +/md_discovery/output/* +/hpo/output/* diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py index e44b9d8..e3937cd 100644 --- a/hpo/er_model_hpo.py +++ b/hpo/er_model_hpo.py @@ -1,14 +1,18 @@ import os +import numpy as np +import torch +import json from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer from ConfigSpace.conditions import InCondition +from ConfigSpace.read_and_write import json as csj import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm import pandas as pd from smac import HyperparameterOptimizationFacade, Scenario from settings import * -from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable +from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict # 数据在外部加载 ######################################################################################################################## @@ -171,23 +175,23 @@ class Classifier: predictions_attrs.extend(attrs_with_r_prefix) predictions_attrs.extend(['gold', 'predicted']) predictions = predictions[predictions_attrs] + predictions = predictions.reset_index(drop=True) + predictions = predictions.astype(str) + sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) # 默认路径为 "../md_discovery/output/xxx.txt" - # 真阳/假阴 mds/vio 共4个md文件 - md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt', - md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt'] - epl_match = 0 # 可解释,预测match - nepl_mismatch = 0 # 不可解释,预测mismatch + # mds/vio 共2个md文件 + md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt'] md_list = load_mds(md_paths) # 从全局变量中读取所有的md + epl_match = 0 # 可解释,预测match if len(md_list) > 0: for line in predictions.itertuples(): - if is_explicable(line, md_list): - if getattr(line, 'predicted') == 1: - epl_match += 1 - else: - if getattr(line, 'predicted') == 0: - nepl_mismatch += 1 - interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1): + epl_match += 1 + + ppre = predictions[predictions['predicted'] == str(1)] + interpretability = epl_match / len(ppre) # 可解释性 + # todo block_recall可以考虑以下注释 # if indicators["block_recall"] >= 0.8: # f1 = indicators["F1"] # else: @@ -201,10 +205,15 @@ class Classifier: def ml_er_hpo(): classifier = Classifier() + cs = classifier.configspace + str_configspace = csj.write(cs) 
+ dict_configspace = json.loads(str_configspace) + with open(hpo_output_dir + "configspace.json", "w") as f: + json.dump(dict_configspace, f) # Next, we create an object, holding general information about the run scenario = Scenario( - classifier.configspace, + cs, deterministic=True, n_trials=10, # We want to run max 50 trials (combination of config and seed) n_workers=1 @@ -221,9 +230,11 @@ def ml_er_hpo(): ) incumbent = smac.optimize() + incumbent_ndarray = incumbent.get_array() + np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray) # Get cost of default configuration - default_cost = smac.validate(classifier.configspace.get_default_configuration()) + default_cost = smac.validate(cs.get_default_configuration()) print(f"Default cost: {default_cost}") # Let's calculate the cost of the incumbent @@ -235,4 +246,4 @@ def ml_er_hpo(): if __name__ == '__main__': - print(1) + ml_er_hpo() diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py index 043bcdd..876513a 100644 --- a/md_discovery/md_discover.py +++ b/md_discovery/md_discover.py @@ -1,5 +1,4 @@ -from md_discovery.multi_process_infer_by_pairs import inference_from_record_pairs -from md_discovery.multi_process_infer_by_pairs import get_mds_metadata +from md_discovery import tmp_discover from settings import * # # 若不输出support和confidence,使用以下两块代码 @@ -18,51 +17,25 @@ from settings import * def md_discover(): # 目前可以仿照这个main函数写 - tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv" - fn_single_tuple_path = er_output_dir + "fn_single_tuple.csv" + t_single_tuple_path = er_output_dir + "t_single_tuple.csv" # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 - # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8) - # 例如此处输入参数要求md左侧相似度字段至少为0.7,右侧指向'id'字段 - tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) - fn_mds, fn_vio = inference_from_record_pairs(fn_single_tuple_path, similarity_threshold, target_attr) + # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值 + mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr) - # 如果不需要输出support和confidence,去掉下面两行 - tp_mds_meta = get_mds_metadata(tp_mds, tp_single_tuple_path, target_attr) - tp_vio_meta = get_mds_metadata(tp_vio, tp_single_tuple_path, target_attr) - - fn_mds_meta = get_mds_metadata(fn_mds, fn_single_tuple_path, target_attr) - fn_vio_meta = get_mds_metadata(fn_vio, fn_single_tuple_path, target_attr) - - # 若输出support和confidence,使用以下两块代码 # 将列表1写入本地,路径需自己修改 - tp_mds_path = md_output_dir + "tp_mds.txt" - tp_vio_path = md_output_dir + "tp_vio.txt" + mds_path = md_output_dir + "mds.txt" + vio_path = md_output_dir + "vio.txt" - with open(tp_mds_path, 'w') as f: - for _ in tp_mds_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') - f.write('\n') - - with open(tp_vio_path, 'w') as f: - for _ in tp_vio_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') - f.write('\n') - - fn_mds_path = md_output_dir + "fn_mds.txt" - fn_vio_path = md_output_dir + "fn_vio.txt" - - with open(fn_mds_path, 'w') as f: - for _ in fn_mds_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') + with open(mds_path, 'w') as f: + for _ in mds_list: + f.write('Target:'+str(target_attr) + '\t') + f.write(str(_)) f.write('\n') - with open(fn_vio_path, 'w') as f: - for _ in fn_vio_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') + with open(vio_path, 'w') as f: + for _ in vio_list: + f.write('Target:'+str(target_attr) + '\t') + f.write(str(_)) 
f.write('\n') diff --git a/md_discovery/multi_process_infer_by_pairs.py b/md_discovery/multi_process_infer_by_pairs.py index d89e51f..319c24d 100644 --- a/md_discovery/multi_process_infer_by_pairs.py +++ b/md_discovery/multi_process_infer_by_pairs.py @@ -7,7 +7,7 @@ import time import torch from tqdm import tqdm from transformers import AutoTokenizer, AutoModel -from settings import model, embedding_dict, er_output_dir +from settings import model, er_output_dir from sentence_transformers.util import cos_sim conf_thresh = 0.8 @@ -91,7 +91,7 @@ def test_load(): # # print(sim.tolist()[0][0]/2 + 0.5) -def if_minimal(md, md_list, target_col): +def is_minimal(md, md_list, target_col): # 假设这个md是minimal minimal = True if len(md_list) == 0: @@ -153,7 +153,7 @@ def inference_from_record_pairs(path, threshold, target_col): # sims是两行的相似度 sims = {} for col in columns: - similarity = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)]) + similarity = norm_cos_sim(getattr(row1, col), getattr(row2, col)) sims[col] = similarity # 寻找violated md,从md列表中删除并加入vio列表 @@ -178,7 +178,7 @@ def inference_from_record_pairs(path, threshold, target_col): # new_rhs = sims[target_col] # spec_r_md = copy.deepcopy(vio_md) # spec_r_md[target_col] = new_rhs - # if if_minimal(spec_r_md, md_list, target_col): + # if is_minimal(spec_r_md, md_list, target_col): # md_list.append(spec_r_md) # 特殊化左侧 @@ -186,11 +186,11 @@ def inference_from_record_pairs(path, threshold, target_col): if sims[col] + 0.01 <= 1: spec_l_md = copy.deepcopy(vio_md) spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01 - if if_minimal(spec_l_md, md_list, target_col): + if is_minimal(spec_l_md, md_list, target_col): md_list.append(spec_l_md) # for vio in minimal_vio[:]: - # if not if_minimal(vio, md_list, target_col): + # if not is_minimal(vio, md_list, target_col): # minimal_vio.remove(vio) # fuck = len(minimal_vio) @@ -216,11 +216,11 @@ def inference_from_record_pairs(path, threshold, target_col): # minimal_vio = list(proxy_minimal_vio) # # for _ in minimal_vio[:]: - # if not if_minimal(_, minimal_vio, target_col): + # if not is_minimal(_, minimal_vio, target_col): # minimal_vio.remove(_) # for _ in md_list[:]: - if not if_minimal(_, md_list, target_col): + if not is_minimal(_, md_list, target_col): md_list.remove(_) return md_list, minimal_vio @@ -258,7 +258,7 @@ def get_one_md_metadata(md, dataframe, target_col): left_satisfy = True both_satisfy = True for col in columns: - sim = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)]) + sim = norm_cos_sim(getattr(row1, col), getattr(row2, col)) if col == target_col: if sim + 0.0000001 < 1: both_satisfy = False diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py index b75ee07..4465e6e 100644 --- a/md_discovery/tmp_discover.py +++ b/md_discovery/tmp_discover.py @@ -4,27 +4,18 @@ from concurrent.futures import ProcessPoolExecutor from multiprocessing.managers import SharedMemoryManager import numpy as np -import pandas import pandas as pd -import Levenshtein import copy import torch from tqdm import tqdm -from md_discovery.multi_process_infer_by_pairs import norm_cos_sim -from settings import embedding_dict, model +from settings import model, md_output_dir conf_thresh = 0.8 -def my_Levenshtein_ratio(str1, str2): - if max(len(str1), len(str2)) == 0: - return 1 - return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2)) - - -def if_minimal(md, md_list, target_col): +def is_minimal(md, md_list, 
target_col): # 假设这个md是minimal if len(md_list) == 0: return True @@ -49,23 +40,7 @@ def if_minimal(md, md_list, target_col): return minimal -def remove_by_confidence(md, md_list, relation, sim_tensor, target_col, lock): - support, confidence = get_one_md_metadata(md, relation, sim_tensor, target_col) - if confidence < 0.8: - with lock: - md_list.remove(md) - - -# def remove_by_confidence(md, l, relation, target_col): -# boolean, conf = satisfy_confidence(md, relation, 0.8, target_col) -# if not boolean: -# l.remove(md) -# print(md, '\t', conf) - -# def build_sim_matrix(): -# width -# return 0 -def inference_from_record_pairs(path, threshold, target_col): +def pairs_inference(path, threshold, target_col): data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1') data.fillna("", inplace=True) data = data.astype(str) @@ -87,7 +62,7 @@ def inference_from_record_pairs(path, threshold, target_col): sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) sim_tensor = sim_tensor/2 + 0.5 - torch.save(sim_tensor, "E:\\Data\\Research\\Projects\\matching_dependency\\tensor.pt") + torch.save(sim_tensor, md_output_dir + "tensor.pt") md_list = [] minimal_vio = [] @@ -108,11 +83,10 @@ def inference_from_record_pairs(path, threshold, target_col): sims[col] = similarity # 寻找violated md,从md列表中删除并加入vio列表 - # tmp_md_list = copy.deepcopy(md_list) for md in md_list[:]: lhs_satis = True rhs_satis = True - for col in list(set(columns) - {target_col}): + for col in cols_but_target: if sims[col] < md[col]: lhs_satis = False break @@ -123,32 +97,23 @@ def inference_from_record_pairs(path, threshold, target_col): violated_mds.append(md) for vio_md in violated_mds: - # 特殊化右侧,我们需要右侧百分百相似,其实不需要降低右侧阈值 - # if sims[target_col] >= threshold: - # new_rhs = sims[target_col] - # spec_r_md = copy.deepcopy(vio_md) - # spec_r_md[target_col] = new_rhs - # if if_minimal(spec_r_md, md_list, target_col): - # md_list.append(spec_r_md) # 特殊化左侧 - for col in list(set(columns) - {target_col}): + for col in cols_but_target: if sims[col] + 0.01 <= 1: spec_l_md = copy.deepcopy(vio_md) spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01 - if if_minimal(spec_l_md, md_list, target_col): + if is_minimal(spec_l_md, md_list, target_col): md_list.append(spec_l_md) if vio_md not in minimal_vio: minimal_vio.append(vio_md) if len(md_list) == 0: terminate = True break - - # tmp_minimal_vio = copy.deepcopy(minimal_vio) if terminate: break if len(md_list) > 0: for vio in minimal_vio[:]: - if not if_minimal(vio, md_list, target_col): + if not is_minimal(vio, md_list, target_col): minimal_vio.remove(vio) print('mds_list\t', len(md_list), '\n') @@ -157,64 +122,23 @@ def inference_from_record_pairs(path, threshold, target_col): if len(minimal_vio) == 0: return md_list, [] - # manager = multiprocessing.Manager() - # lock = manager.Lock() - # pool_size = 4 - # pool = multiprocessing.Pool(pool_size) - # with manager: - # proxy_minimal_vio = manager.list(minimal_vio) - # for _ in minimal_vio[:]: - # pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, sim_tensor, target_col, lock)) - # pool.close() - # pool.join() - # minimal_vio = list(proxy_minimal_vio) - - # minimal_vio.reverse() - i = 0 remove_list = [] - fuck = [] + # fuck = [] for md in minimal_vio: support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index) - fuck.append((support, confidence)) + # fuck.append((support, confidence)) if support < 1: print('delete by support') remove_list.append(md) - if confidence 
< 0.8: + if confidence < 0.5: print('delete by confidence') remove_list.append(md) - fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True) - # while i < len(minimal_vio): - # print('vio_index\t', i) - # print('vio_length', len(minimal_vio)) - # current_md = minimal_vio[i] - # support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index) - # # if support < 50: - # # minimal_vio_length = len(minimal_vio) - # # j = i + 1 - # # while j < len(minimal_vio): - # # specialization = True - # # next_md = minimal_vio[j] - # # for col in cols_but_target: - # # if current_md[col] > next_md[col]: - # # specialization = False - # # break - # # if specialization: - # # minimal_vio.remove(next_md) - # # else: - # # j += 1 - # # print('sup') - # # minimal_vio.remove(current_md) - # if support < 1: - # print('delete by support') - # minimal_vio.remove(current_md) - # if confidence < 0.8: - # print('delete by confidence') - # minimal_vio.remove(current_md) - # if support >= 1 and confidence >= 0.8: - # i += 1 + # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True) + for _ in remove_list: + minimal_vio.remove(_) for _ in minimal_vio[:]: - if not if_minimal(_, minimal_vio, target_col): + if not is_minimal(_, minimal_vio, target_col): minimal_vio.remove(_) print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m') diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py index 1ac0e5e..c5427f3 100644 --- a/ml_er/ml_entity_resolver.py +++ b/ml_er/ml_entity_resolver.py @@ -1,33 +1,35 @@ +import json import os import sys +import ConfigSpace +import pandas +import torch from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric - +from ConfigSpace.read_and_write import json as csj import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm import pandas as pd import six from ConfigSpace import Configuration -from md_discovery.multi_process_infer_by_pairs import my_Levenshtein_ratio, norm_cos_sim from settings import * def process_prediction_for_md_discovery(pred: pd.DataFrame, - tp_single_tuple_path: str = er_output_dir + "tp_single_tuple.csv", - fn_single_tuple_path: str = er_output_dir + "fn_single_tuple.csv"): + t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"): # 提取预测表中真阳和假阴部分 tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)] fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)] + # 拼成一张表 + df = pd.concat([tp, fn]) # 将真阳/假阴表中左右ID调整一致 - for index, row in tp.iterrows(): - tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] - for index, row in fn.iterrows(): - fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] + for index, row in df.iterrows(): + df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] pred_columns = pred.columns.values.tolist() l_columns = [] r_columns = [] - columns = [] + cols = [] # 将预测表中左表和右表字段名分别加入两个列表 for _ in pred_columns: if _.startswith('ltable'): @@ -36,25 +38,15 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame, r_columns.append(_) # 将左表中字段名去掉前缀,作为统一的字段名列表(前提是两张表内对应字段名调整一致) for _ in l_columns: - columns.append(_.replace('ltable_', '')) - - # 将表拆分成左右两部分 - tpl = tp[l_columns] - tpr = tp[r_columns] - # 将左右两部分字段名统一 - tpl.columns = columns - tpr.columns = columns + cols.append(_.replace('ltable_', '')) - fnl = fn[l_columns] - fnr = fn[r_columns] - fnl.columns = columns - fnr.columns = columns + ldf = df[l_columns] + rdf = df[r_columns] + ldf.columns = cols + rdf.columns = cols + t_single_tuple = pd.concat([ldf, rdf]) - 
tp_single_tuple = pd.concat([tpl, tpr]) - fn_single_tuple = pd.concat([fnl, fnr]) - - tp_single_tuple.to_csv(tp_single_tuple_path, sep=',', index=False, header=True) - fn_single_tuple.to_csv(fn_single_tuple_path, sep=',', index=False, header=True) + t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True) def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, @@ -100,43 +92,57 @@ def load_mds(paths: list) -> list: # 读取每一行的md,加入该文件的md列表 for line in f.readlines(): md_metadata = line.strip().split('\t') - md = eval(md_metadata[0].replace('md:', '')) - confidence = eval(md_metadata[2].replace('confidence:', '')) - if confidence > 0: - mds.append(md) + # todo 如果MD文件的形式改了 这里也要改 + md = eval(md_metadata[1]) + mds.append(md) all_mds.extend(mds) return all_mds -def is_explicable(row, all_mds: list) -> bool: +def is_explicable(row, all_mds: list, st_dict) -> bool: attrs = all_mds[0].keys() # 从第一条md中读取所有字段 for md in all_mds: explicable = True # 假设这条md能解释当前元组 for a in attrs: - threshold = md[a] - if norm_cos_sim(embedding_dict[str(getattr(row, 'ltable_'+a))], - embedding_dict[str(getattr(row, 'rtable_'+a))]) < threshold: - explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 - break # 不再与当前md的其他相似度阈值比较,跳转到下一条md + if a != target_attr: + if st_dict[a][row[0]].item() < md[a]: + explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 + break # 不再与当前md的其他相似度阈值比较,跳转到下一条md if explicable: return True # 任意一条md能解释,直接返回 return False # 遍历结束,不能解释 -def load_data(left_path: str, right_path: str, mapping_path: str): - left = pd.read_csv(left_path, encoding='ISO-8859-1') - cm.set_key(left, left.columns.values.tolist()[0]) - left.fillna("", inplace=True) - left = left.astype(str) - - right = pd.read_csv(right_path, encoding='ISO-8859-1') - cm.set_key(right, right.columns.values.tolist()[0]) - right.fillna("", inplace=True) - right = right.astype(str) - - mapping = pd.read_csv(mapping_path) - mapping = mapping.astype(str) - return left, right, mapping +def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame): + predictions_attrs = predictions.columns.values.tolist() + col_tuple_list = [] + for _ in predictions_attrs: + if _.startswith('ltable'): + left_index = predictions_attrs.index(_) + right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_')) + col_tuple_list.append((left_index, right_index)) + + length = predictions.shape[0] + width = predictions.shape[1] + sentences = [] + for col in range(0, width): + for row in range(0, length): + cell_value = predictions.values[row, col] + sentences.append(cell_value) + embedding = model.encode(sentences, convert_to_tensor=True, device="cuda") + split_embedding = torch.split(embedding, length, dim=0) + table_tensor = torch.stack(split_embedding, dim=0, out=None) + # prediction的归一化嵌入张量 + norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) + sim_tensor_dict = {} + for col_tuple in col_tuple_list: + lattr_tensor = norm_table_tensor[col_tuple[0]] + rattr_tensor = norm_table_tensor[col_tuple[1]] + mul_tensor = lattr_tensor * rattr_tensor + sim_tensor = torch.sum(mul_tensor, 1) + sim_tensor = sim_tensor / 2 + 0.5 + sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor + return sim_tensor_dict def ml_er(iter_round: int, config: Configuration = None, ): @@ -277,30 +283,27 @@ def ml_er(iter_round: int, config: Configuration = None, ): predictions_attrs.extend(attrs_with_r_prefix) predictions_attrs.extend(['gold', 'predicted']) predictions = 
predictions[predictions_attrs] + process_prediction_for_md_discovery(predictions) + predictions = predictions.reset_index(drop=True) + predictions = predictions.astype(str) + sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) - md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt', - md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt'] - epl_match = 0 # 可解释,预测match - nepl_mismatch = 0 # 不可解释,预测mismatch - + md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt'] md_list = load_mds(md_paths) # 从全局变量中读取所有的md + epl_match = 0 # 可解释,预测match if len(md_list) > 0: for row in predictions.itertuples(): - if is_explicable(row, md_list): - if getattr(row, 'predicted') == 1: - epl_match += 1 - else: - if getattr(row, 'predicted') == 0: - nepl_mismatch += 1 - - interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + if is_explicable(row, md_list, sim_tensor_dict) and str(getattr(row, 'predicted')) == str(1): + epl_match += 1 + + df = predictions[predictions['predicted'] == str(1)] + interpretability = epl_match / len(df) # 可解释性 if indicators["block_recall"] >= 0.8: f1 = indicators["F1"] else: f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"]) performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 ################################################################################################################ - process_prediction_for_md_discovery(predictions) output_path = er_output_dir + "eval_result_" + str(iter_round) + ".txt" with open(output_path, 'w') as f: @@ -313,4 +316,20 @@ def ml_er(iter_round: int, config: Configuration = None, ): if __name__ == '__main__': - ml_er(1) + iterations = 1 + filename_list = os.listdir(er_output_dir) + if len(filename_list) > 0: + for _ in filename_list: + if _.startswith('eval_result'): + iterations = int(_[12:13]) + 1 + + if iterations > 1: + incumbent_array = np.load(hpo_output_dir + 'incumbent.npy') + with open(hpo_output_dir + "configspace.json", 'r') as f: + dict_configspace = json.load(f) + str_configspace = json.dumps(dict_configspace) + configspace = csj.read(str_configspace) + configuration = ConfigSpace.Configuration(configspace, vector=incumbent_array) + ml_er(iterations, configuration) + else: + ml_er(1) diff --git a/settings.py b/settings.py index fbcabe8..a275b11 100644 --- a/settings.py +++ b/settings.py @@ -16,6 +16,5 @@ confidence_threshold = 0.8 interpre_weight = 0.3 # 可解释性权重 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\' md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\' +hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\' model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2') -embedding_dict = np.load('E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\embedding_dic.npy', - allow_pickle=True).item() diff --git a/tfile.py b/tfile.py index d85cc0c..f6b65a6 100644 --- a/tfile.py +++ b/tfile.py @@ -1,13 +1,16 @@ +import json import multiprocessing +import os import time + +import ConfigSpace import numpy as np import pandas as pd import torch from tqdm import tqdm - -from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs +from ConfigSpace.read_and_write import json as csj from md_discovery import tmp_discover -from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict +from 
settings import er_output_dir, similarity_threshold, target_attr, hpo_output_dir def fuck(i): @@ -64,9 +67,60 @@ def test4(): print(torch.count_nonzero(one_bool_tensor).item()) +def test5(): + ten1 = torch.tensor([[1, 2, 3], + [7, 8, 9]]) + ten2 = torch.tensor([[4, 5, 6], + [11, 12, 15]]) + result = ten1 * ten2 + r = torch.sum(result, 1) + print('\n') + print(result) + print(r) + + +def test6(): + table_tensor = torch.tensor([[[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]], + [[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]]]) + t = torch.tensor([[1., 2., 3.], + [4., 5., 6.]]) + norm1 = torch.nn.functional.normalize(table_tensor, dim=1) + norm2 = torch.nn.functional.normalize(table_tensor, dim=2) + print('\n') + print(norm1) + print(norm2) + print(t.shape) + + +def test7(): + iterations = 1 + filename_list = os.listdir(er_output_dir) + if len(filename_list) > 0: + for _ in filename_list: + if _.startswith('eval_result'): + iterations = int(_[12:13]) + 1 + print(iterations) + + +def test8(): + cum = np.load(hpo_output_dir + 'incumbent.npy') + with open(hpo_output_dir + "configspace.json", 'r') as load_f: + dict_configspace = json.load(load_f) + str_configspace = json.dumps(dict_configspace) + configspace = csj.read(str_configspace) + config = ConfigSpace.Configuration(configspace, vector=cum) + print(cum) + + if __name__ == '__main__': start = time.time() - tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv" + t_single_tuple_path = er_output_dir + "t_single_tuple.csv" # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) - tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) + tp_mds, tp_vio = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr) print(time.time() - start) + +
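
Note (not part of the patch): the recurring pattern in this change set is GPU-side cosine similarity over sentence embeddings. Cell values are encoded with the SentenceTransformer model, L2-normalized so that a row-wise dot product equals cosine similarity, and then rescaled from [-1, 1] to [0, 1] via sim / 2 + 0.5, which is the value compared against the MD thresholds. The following is a minimal, self-contained sketch of that computation; the model name and the example strings are placeholders, not taken from the repository.

    import torch
    from sentence_transformers import SentenceTransformer

    # Hypothetical model reference; the patch loads paraphrase-MiniLM-L6-v2 from a local path in settings.py.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    left_cells = ['Apple iPhone 13', 'Sony WH-1000XM4']          # ltable_* column values
    right_cells = ['iPhone 13 (Apple)', 'Sony wireless headset']  # matching rtable_* column values

    # Encode both columns in one batch; pass device="cuda" as the patch does when a GPU is available.
    emb = model.encode(left_cells + right_cells, convert_to_tensor=True)
    left, right = emb[:len(left_cells)], emb[len(left_cells):]

    # Normalize so the elementwise product summed over the embedding dimension is cosine similarity.
    left = torch.nn.functional.normalize(left, dim=1)
    right = torch.nn.functional.normalize(right, dim=1)

    # Row-wise cosine similarity, rescaled from [-1, 1] to [0, 1] as in pairs_inference and build_col_pairs_sim_tensor_dict.
    sim = torch.sum(left * right, dim=1) / 2 + 0.5
    print(sim)

The rescaling keeps every similarity in [0, 1], so the MD left-hand-side thresholds (and the 0.01 specialization step used when a violated MD is tightened) stay on a single, bounded scale regardless of the embedding model.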