完成Magellan实验

1 year ago · fe5c1288ed
parent f1e732afaf
commit fe5c1288ed
10 changed files with 151 additions and 961 deletions
--- a/draw/draw_md_cluster_with_data_point.py
+++ b/draw/draw_md_cluster_with_data_point.py
@ -1,73 +1,73 @@
-# 将数据点和MD一起聚类
-import os
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-
-from draw_md_cluster import DBSCAN
-from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
-
-
-def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
-    clusterNum = len(set(labels_))
-    fig = plt.figure()
-    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
-    ax = fig.add_subplot(111, projection='3d')
-    for i in range(-1, clusterNum):
-        colorStyle = scatterColors[i % len(scatterColors)]
-        subCluster = md_data_[np.where(labels_ == i)]
-        ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
-    ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
-    if pre_mismatch_points_.shape[0] > 0:
-        ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
-    ax.set_xlabel(md_keys_[0], rotation=0)  # 设置标签角度
-    ax.set_ylabel(md_keys_[1], rotation=-45)
-    ax.set_zlabel(md_keys_[2], rotation=0)
-    plt.title(output_path_.split('\\')[-1].split('.')[0])
-    plt.savefig(output_path_, dpi=500)
-    plt.show()
-
-
-if __name__ == '__main__':
-    outcome_path = r'E:\Data\Research\Outcome'
-    config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
-    dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
-    for dataset_name in dataset_name_list:
-        absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt'  # MD路径
-        predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv'  # prediction路径
-        pred = pd.read_csv(predictions)
-        pred = pred.astype(str)
-        # pred = pred[pred['predicted'] == str(1)]
-        sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
-        # 选取的三个字段
-        md_keys = []
-        with open(absolute_path, 'r') as f:
-            # 读取每一行的md，加入该文件的md列表
-            md_data = []
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md_tuple = eval(md_metadata[1])
-                md_keys = list(md_tuple[0].keys())[1:4]
-                md_values = list(md_tuple[0].values())
-                md_data.append(md_values[1:4])
-                if len(md_data) == 10000:
-                    break
-
-        pre_match_points = []
-        pre_mismatch_points = []
-        for _ in pred.itertuples():
-            data_point_value = []
-            for key in md_keys:
-                sim_tensor = sim_tensor_dict[key]
-                data_point_value.append(round(float(sim_tensor[_[0]]), 4))
-            if getattr(_, 'predicted') == str(1):
-                pre_match_points.append(data_point_value)
-            elif getattr(_, 'predicted') == str(0):
-                pre_mismatch_points.append(data_point_value)
-
-        md_data = np.array(md_data, dtype=np.float32)
-        pre_match_points = np.array(pre_match_points, dtype=np.float32)
-        pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
-        labels = DBSCAN(md_data, 0.5, 30)
-        output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
-        plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)
+# # 将数据点和MD一起聚类
+# import os
+# import numpy as np
+# import pandas as pd
+# from matplotlib import pyplot as plt
+#
+# from draw_md_cluster import DBSCAN
+# from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
+#
+#
+# def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
+#     clusterNum = len(set(labels_))
+#     fig = plt.figure()
+#     scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
+#     ax = fig.add_subplot(111, projection='3d')
+#     for i in range(-1, clusterNum):
+#         colorStyle = scatterColors[i % len(scatterColors)]
+#         subCluster = md_data_[np.where(labels_ == i)]
+#         ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
+#     ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
+#     if pre_mismatch_points_.shape[0] > 0:
+#         ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
+#     ax.set_xlabel(md_keys_[0], rotation=0)  # 设置标签角度
+#     ax.set_ylabel(md_keys_[1], rotation=-45)
+#     ax.set_zlabel(md_keys_[2], rotation=0)
+#     plt.title(output_path_.split('\\')[-1].split('.')[0])
+#     plt.savefig(output_path_, dpi=500)
+#     plt.show()
+#
+#
+# if __name__ == '__main__':
+#     outcome_path = r'E:\Data\Research\Outcome'
+#     config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
+#     dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
+#     for dataset_name in dataset_name_list:
+#         absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt'  # MD路径
+#         predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv'  # prediction路径
+#         pred = pd.read_csv(predictions)
+#         pred = pred.astype(str)
+#         # pred = pred[pred['predicted'] == str(1)]
+#         sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
+#         # 选取的三个字段
+#         md_keys = []
+#         with open(absolute_path, 'r') as f:
+#             # 读取每一行的md，加入该文件的md列表
+#             md_data = []
+#             for line in f.readlines():
+#                 md_metadata = line.strip().split('\t')
+#                 md_tuple = eval(md_metadata[1])
+#                 md_keys = list(md_tuple[0].keys())[1:4]
+#                 md_values = list(md_tuple[0].values())
+#                 md_data.append(md_values[1:4])
+#                 if len(md_data) == 10000:
+#                     break
+#
+#         pre_match_points = []
+#         pre_mismatch_points = []
+#         for _ in pred.itertuples():
+#             data_point_value = []
+#             for key in md_keys:
+#                 sim_tensor = sim_tensor_dict[key]
+#                 data_point_value.append(round(float(sim_tensor[_[0]]), 4))
+#             if getattr(_, 'predicted') == str(1):
+#                 pre_match_points.append(data_point_value)
+#             elif getattr(_, 'predicted') == str(0):
+#                 pre_mismatch_points.append(data_point_value)
+#
+#         md_data = np.array(md_data, dtype=np.float32)
+#         pre_match_points = np.array(pre_match_points, dtype=np.float32)
+#         pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
+#         labels = DBSCAN(md_data, 0.5, 30)
+#         output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
+#         plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)
--- a/hpo/magellan_hpo.py
+++ b/hpo/magellan_hpo.py
@ -1,12 +1,14 @@
 import json
 import pickle
+import time

 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
-from smac import HyperparameterOptimizationFacade, Scenario
+from colorama import Fore, init
+from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade

 from ml_er.magellan_er import matching
 from settings import *
@ -61,9 +63,7 @@ class Optimization:

    def train(self, config: Configuration, seed: int = 0) -> float:
        cm.del_catalog()
-        with open(er_output_dir + "blocking_result.pickle", "rb") as file:
-            blocking_result = pickle.load(file)
-        indicators = matching(config, blocking_result)
+        indicators = matching(config)
        return 1 - indicators['performance']


@ -73,20 +73,20 @@ def ml_er_hpo():
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # 将超参数空间保存本地
-    with open(hpo_output_dir + "configspace.json", "w") as f:
+    with open(hpo_output_dir + r"\configspace.json", "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
-        n_trials=20,
+        n_trials=16,
        n_workers=1
    )

-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)

-    smac = HyperparameterOptimizationFacade(
+    smac = BlackBoxFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
@ -97,19 +97,21 @@ def ml_er_hpo():
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
-    print(f"Default Cost: {default_cost}")
-    print(f"Incumbent Cost: {incumbent_cost}")
+    print(Fore.BLUE + f"Default Cost: {default_cost}")
+    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")

    if incumbent_cost > default_cost:
        incumbent = default
-        print(f"Updated Incumbent Cost: {default_cost}")
+        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')

-    print(f"Optimized Configuration:{incumbent.values()}")
+    print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")

-    with open(hpo_output_dir + "incumbent.json", "w") as f:
+    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


 if __name__ == '__main__':
+    init(autoreset=True)
+    print(Fore.CYAN + f'Start Time: {time.time()}')
    ml_er_hpo()
--- a/md_discovery/discovery_executor.py
+++ b/md_discovery/discovery_executor.py
@ -1,200 +0,0 @@
-import operator
-
-import numpy as np
-import pandas as pd
-import copy
-import torch
-from ConfigSpace import Configuration
-from tqdm import tqdm
-from settings import model, similarity_threshold, support_threshold, confidence_threshold
-
-
-def is_minimal(md, md_list, target_col):
-    # 假设这个md是minimal
-    if len(md_list) == 0:
-        return True
-    minimal = True
-    for _ in md_list:
-        if isinstance(_, tuple):
-            _ = _[0]
-        if _ != md:
-            other_cols = list(set(_.keys()) - {target_col})
-            # 假设列表中每一个md都使当前md不minimal
-            exist = True
-            # 如果左边任何一个大于，则假设不成立
-            for col in other_cols:
-                if _[col] > md[col]:
-                    exist = False
-                    break
-            # 如果右边小于，假设也不成立
-            if _[target_col] < md[target_col]:
-                exist = False
-            # 任何一次假设成立，当前md不minimal
-            if exist:
-                minimal = False
-                break
-    return minimal
-
-
-def pairs_inference(path, target_col):
-    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-    data.fillna("", inplace=True)
-    data = data.astype(str)
-    columns = data.columns.values.tolist()
-    target_index = columns.index(target_col)
-    cols_but_target = list(set(columns) - {target_col})
-    length = data.shape[0]
-    width = data.shape[1]
-
-    sentences = []
-    for col in range(0, width):
-        for row in range(0, length):
-            cell_value = data.values[row, col]
-            sentences.append(cell_value)
-    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
-    split_embedding = torch.split(embedding, length, dim=0)
-    table_tensor = torch.stack(split_embedding, dim=0, out=None)
-    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
-    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
-    # sim_tensor = torch.round(sim_tensor, decimals=3)
-
-    # torch.save(sim_tensor, md_output_dir + "tensor.pt")
-
-    md_list = []
-    minimal_vio = []
-    init_md = {}
-    for col in columns:
-        init_md[col] = 1 if col == target_col else -1
-    md_list.append(init_md)
-
-    for row1 in tqdm(range(0, length - 1)):
-        terminate = False
-        for row2 in range(row1 + 1, length):
-            violated_mds = []
-            # sims是两行的相似度
-            sims = {}
-            for col_index in range(0, width):
-                col = columns[col_index]
-                similarity = sim_tensor[col_index, row1, row2].item()
-                sims[col] = similarity
-
-            # 寻找violated md,从md列表中删除并加入vio列表
-            for md in md_list[:]:
-                lhs_satis = True
-                rhs_satis = True
-                for col in cols_but_target:
-                    if sims[col] < md[col]:
-                        lhs_satis = False
-                        break
-                if sims[target_col] < md[target_col]:
-                    rhs_satis = False
-                if lhs_satis == True and rhs_satis == False:
-                    md_list.remove(md)
-                    violated_mds.append(md)
-
-            # for vio_md in violated_mds:
-            #     # 特殊化左侧
-            #     for col in cols_but_target:
-            #         if sims[col] + 0.01 <= 1:
-            #             spec_l_md = copy.deepcopy(vio_md)
-            #             spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
-            #             if is_minimal(spec_l_md, md_list, target_col):
-            #                 md_list.append(spec_l_md)
-            #     if vio_md not in minimal_vio:
-            #         minimal_vio.append(vio_md)
-
-            for vio_md in violated_mds:
-                vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
-                if vio_md_support >= support_threshold:
-                    for col in cols_but_target:
-                        if sims[col] < 1.0:
-                            spec_l_md = copy.deepcopy(vio_md)
-                            if sims[col] < similarity_threshold:
-                                spec_l_md[col] = similarity_threshold
-                            else:
-                                if sims[col] + 0.01 <= 1.0:
-                                    spec_l_md[col] = sims[col] + 0.01
-                                else:
-                                    spec_l_md[col] = 1.0
-                            if is_minimal(spec_l_md, md_list, target_col):
-                                md_list.append(spec_l_md)
-                    if vio_md not in minimal_vio:
-                        minimal_vio.append(vio_md)
-
-            if len(md_list) == 0:
-                terminate = True
-                break
-        if terminate:
-            break
-
-    if len(minimal_vio) > 0:
-        for md in minimal_vio[:]:
-            support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-            if support >= support_threshold and confidence >= confidence_threshold:
-                minimal_vio.append((md, support, confidence))
-            minimal_vio.remove(md)
-
-    if len(md_list) > 0:
-        # 去除重复MD
-        tmp = []
-        for _ in md_list:
-            if _ not in tmp:
-                tmp.append(_)
-        md_list = tmp
-        # 去除support小于阈值MD
-        for _ in md_list[:]:
-            support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
-            if support >= support_threshold and confidence >= confidence_threshold:
-                md_list.append((_, support, confidence))
-            md_list.remove(_)
-        # 去除不minimal的MD
-        for md_tuple in md_list[:]:
-            if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
-                md_list.remove(md_tuple)
-        if len(minimal_vio) > 0:
-            for vio_tuple in minimal_vio[:]:
-                if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
-                    minimal_vio.remove(vio_tuple)
-
-    if len(minimal_vio) > 0:
-        for vio_tuple in minimal_vio[:]:
-            if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
-                minimal_vio.remove(vio_tuple)
-
-    result = []
-    result.extend(md_list)
-    result.extend(minimal_vio)
-    result.sort(key=operator.itemgetter(2), reverse=True)
-    print(f'\033[33mList Length: {len(result)}\033[0m')
-    return result
-
-
-def get_metrics(current_md, data, sim_tensor, target_col, target_index):
-    columns = data.columns.values.tolist()
-    length = data.shape[0]
-    width = data.shape[1]
-
-    md_tensor = list(current_md.values())
-    md_tensor = torch.tensor(md_tensor, device='cuda')
-    md_tensor_2d = md_tensor.unsqueeze(1)
-    md_tensor_3d = md_tensor_2d.unsqueeze(2)
-    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
-
-    sim_tensor = torch.round(sim_tensor, decimals=4)
-    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
-    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
-    for i in range(0, width):
-        if i != target_index:
-            sup_tensor_slice = sup_tensor[i]
-            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
-    sup_tensor_int = ini_slice.int()
-    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
-    support_Naumann = (support_Naumann - length) / 2
-
-    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
-    conf_tensor_int = ini_slice.int()
-    support_Fan = torch.count_nonzero(conf_tensor_int).item()
-    support_Fan = (support_Fan - length) / 2
-    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
-
-    return support_Fan, confidence
--- a/md_discovery/discovery_executor_gpu.py
+++ b/md_discovery/discovery_executor_gpu.py
@ -1,138 +0,0 @@
-import math
-import operator
-import random
-import time
-from tqdm import tqdm
-import numpy as np
-import pandas as pd
-import torch
-
-from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
-
-sample_number = 100000
-step_length = 0.01
-
-def get_metrics(md_tensor, data, sim_tensor, target_index):
-    length = data.shape[0]
-    width = data.shape[1]
-
-    # md_tensor = list(current_md.values())
-    # md_tensor = torch.tensor(md_tensor, device='cuda')
-    md_tensor_2d = md_tensor.unsqueeze(1)
-    md_tensor_3d = md_tensor_2d.unsqueeze(2)
-    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
-
-    sim_tensor = torch.round(sim_tensor, decimals=4)
-
-    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
-    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
-    for i in range(0, width):
-        if i != target_index:
-            sup_tensor_slice = sup_tensor[i]
-            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
-    sup_tensor_int = ini_slice.int()
-    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
-    support_Naumann = (support_Naumann - length) / 2
-
-    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
-    conf_tensor_int = conf_tensor.int()
-    support_Fan = torch.count_nonzero(conf_tensor_int).item()
-    support_Fan = (support_Fan - length) / 2
-    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
-
-    return support_Fan, confidence
-
-
-def build_cartesian(width, target_index):
-    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
-                                   num=math.ceil((1-similarity_threshold)/step_length) + 1)
-    all_values_array = np.round(all_values_array, 4)
-    all_values_tensor = torch.tensor(all_values_array, device='cuda')
-    all_values_tensor = all_values_tensor.float()
-    all_values_tensor = torch.round(all_values_tensor, decimals=4)
-    tensors_for_cartesian = []
-    for i in range(0, width):
-        if i == target_index:
-            t = torch.tensor([1.0], device='cuda')
-            tensors_for_cartesian.append(t)
-        else:
-            tensors_for_cartesian.append(all_values_tensor)
-    result = torch.cartesian_prod(*tensors_for_cartesian)
-    return result
-
-
-def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
-    length = data.shape[0]
-    width = data.shape[1]
-    cartesian_product = cartesian_product.unsqueeze(2)
-    cartesian_product = cartesian_product.unsqueeze(3)
-    cartesian_product = cartesian_product.repeat(1, 1, length, length)
-
-
-def discover(path, target_col):
-    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-    data.fillna("", inplace=True)
-    data = data.astype(str)
-    columns = data.columns.values.tolist()
-    target_index = columns.index(target_col)
-    cols_but_target = list(set(columns) - {target_col})
-    length = data.shape[0]
-    width = data.shape[1]
-    # 除了目标列外所有列的索引
-    columns_indices = [_ for _ in range(0, width) if _ != target_index]
-
-    sentences = []
-    for col in range(0, width):
-        for row in range(0, length):
-            cell_value = data.values[row, col]
-            sentences.append(cell_value)
-    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
-    split_embedding = torch.split(embedding, length, dim=0)
-    table_tensor = torch.stack(split_embedding, dim=0, out=None)
-    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
-    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
-    sim_tensor = sim_tensor.float()
-    sim_tensor = torch.round(sim_tensor, decimals=4)
-
-    # 小于6列的可以尝试做笛卡尔积，大于6列可能指数爆炸
-    if width < 6:
-        # 列出除目标列以外所有列的所有取值，做笛卡尔积，结果为所有可能MD取值
-        cartesian = build_cartesian(width, target_index)
-        # 抽取sample_number / (width - 1)条MD，不含-1
-        if cartesian.shape[0] > sample_number / (width - 1):
-            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
-            cartesian = torch.index_select(cartesian, 0, index)
-    else:
-        # 随机生成sample_number / (width - 1)条MD，使用randint先转化为int再除成小数，不含-1
-        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
-                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
-        cartesian = cartesian / 100
-        # 生成一列相似度为1的目标列，插入目标列所在位置
-        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
-        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
-        cartesian = torch.round(cartesian, decimals=4)
-    # 此tensor将与其他置为-1的tensor拼接
-    joint_md_tensor = cartesian.clone()
-    # 随机将1列，2列……置为-1
-    for i in range(width - 2):
-        index_list_format = []
-        for j in range(cartesian.shape[0]):
-            # 对每条MD，随机选择将要置为-1的列索引
-            index_list_format.append(random.sample(columns_indices, i + 1))
-        index = torch.tensor(index_list_format, device='cuda')
-        # 随机调整为-1后的MD集合
-        modified_cartesian = cartesian.scatter(1, index, -1)
-        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
-
-    md_list = []
-    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
-    for _ in tqdm(range(joint_md_tensor.shape[0])):
-        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
-        if s >= support_threshold and c >= confidence_threshold:
-            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
-            md_dict_format = {}
-            for k in range(0, width):
-                md_dict_format[columns[k]] = md_list_format[k]
-            md_list.append((md_dict_format, s, c))
-    md_list.sort(key=operator.itemgetter(2), reverse=True)
-    return md_list
--- a/md_discovery/md_discover.py
+++ b/md_discovery/md_discover.py
@ -1,27 +0,0 @@
-from ConfigSpace import Configuration
-
-from md_discovery.discovery_executor import pairs_inference
-from md_discovery.discovery_executor_gpu import discover
-from settings import *
-
-# # 若不输出support和confidence，使用以下两块代码
-# # 将列表1写入本地，路径需自己修改
-# md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
-# with open(md_path, 'w') as f:
-#     for _ in mds:
-#         f.write(str(_) + '\n')
-#
-# # 将列表2写入本地，路径需自己修改
-# vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
-# with open(vio_path, 'w') as f:
-#     for _ in vio:
-#         f.write(str(_) + '\n')
-
-
-def md_discover(config: Configuration, source_path, target_path):
-    mds_list = discover(source_path, target_attr)
-    with open(target_path, 'w') as f:
-        for _ in mds_list:
-            f.write('Target:'+str(target_attr) + '\t')
-            f.write(str(_))
-            f.write('\n')
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@ -1,4 +1,5 @@
 import itertools
+import pickle
 import random
 import operator
 from operator import itemgetter
@ -16,10 +17,12 @@ from settings import *
 def mining(train: pd.DataFrame):
    # data is train set, in which each row represents a tuple pair
    train = train.astype(str)
+    # 将label列移到最后
+    train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)

    # 尝试不将左右表key手动调整相同，而是只看gold属性是否为1
    # 故将左右表key直接去除
-    data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False)
+    data = train.drop(columns=['_id', 'ltable_id', 'rtable_id'], inplace=False)
    # data中现存属性：除key以外左右表属性和gold, 不含_id
    columns = data.columns.values.tolist()
    columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')]
@ -62,7 +65,7 @@ def mining(train: pd.DataFrame):
    # 生成带标签的相似度张量
    sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1)
    # 找到匹配元组对的行索引
-    mask = (data['gold'].isin(['1']))
+    mask = (data['label'].isin(['1']))
    match_pair_indices = data[mask].index.tolist()
    # 根据索引将匹配的行标签置为1
    sim_table_tensor_labeled[match_pair_indices, -1] = 1.00
@ -209,7 +212,7 @@ def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list):


 def mds_to_txt(result_list_):
-    p = md_output_dir + "mds.txt"
+    p = md_output_dir + r"\mds.txt"
    with open(p, 'w') as f:
        for _ in result_list_:
            f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
@ -253,3 +256,10 @@ def merge_mds(md_list_):
            del same_sc_list[index]
    # 二级列表转一级列表
    return list(itertools.chain.from_iterable(grouped_md_tuples))
+
+
+if __name__ == '__main__':
+    _train = pd.read_csv(directory_path + r'\train_whole.csv')
+    result = mining(_train)
+    with open(md_output_dir + r"\mds.pickle", "wb") as file_:
+        pickle.dump(result, file_)
--- a/ml_er/magellan_er.py
+++ b/ml_er/magellan_er.py
@ -11,89 +11,20 @@ from ConfigSpace import Configuration
 from ConfigSpace.read_and_write import json as csj
 import py_entitymatching.catalog.catalog_manager as cm
 from tqdm import tqdm
+from colorama import Fore

-from md_discovery.md_mining import mining
 from settings import *


-def blocking_mining():
-    start = time.time()
-    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-    cm.set_key(ltable, ltable_id)
-    rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-    cm.set_key(rtable, rtable_id)
-    mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
-    matching_number = len(mappings)
-    # if ltable_id == rtable_id:
-    #     tables_id = rtable_id
-    attributes = ltable.columns.values.tolist()
-    # lattributes = ['ltable_' + i for i in attributes]
-    # rattributes = ['rtable_' + i for i in attributes]
-    cm.set_key(ltable, ltable_id)
-    cm.set_key(rtable, rtable_id)
+def matching(config: Configuration):
+    print(Fore.BLUE + f'Config: {config}')
+    with open(md_output_dir + r"\mds.pickle", "rb") as file:
+        md_list = pickle.load(file)

-    blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
-                                     overlap_size=1, show_progress=False)
-    candidate['gold'] = 0
-    candidate = candidate.reset_index(drop=True)
-    block_time = time.time()
-    print(f'Block Time: {block_time - start}')
-
-    # 根据mapping表标注数据
-    candidate_match_rows = []
-    for t in tqdm(mappings.itertuples()):
-        mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
-                (candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
-        matching_indices = candidate[mask].index
-        candidate_match_rows.extend(matching_indices.tolist())
-    match_rows_mask = candidate.index.isin(candidate_match_rows)
-    candidate.loc[match_rows_mask, 'gold'] = 1
-    candidate.fillna(value="", inplace=True)
-
-    # negative样本太多, 采样三倍于positive样本量
-    candidate_mismatch = candidate[candidate['gold'] == 0]
-    candidate_match = candidate[candidate['gold'] == 1]
-    candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
-    candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
-    # 如果拼接后不重设索引可能导致索引重复
-    candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
-    cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
-    cm.set_ltable(candidate_for_train_test, ltable)
-    cm.set_rtable(candidate_for_train_test, rtable)
-    block_recall = len(candidate_match) / matching_number
-
-    # 分为训练测试集
-    train_proportion = 0.5
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
-    train_set = sets['train']
-    test_set = sets['test']
-    label_and_split_time = time.time()
-    print(f'Label and Split Time: {label_and_split_time - block_time}')
-
-    # 挖掘MD并保存本地
-    md_list = mining(train_set)
-    mining_time = time.time()
-    print(f'Mining Time: {mining_time - label_and_split_time}')
-    blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall)
-    # 将blocking结果保存到本地
-    with open(er_output_dir + "blocking_result.pickle", "wb") as file_:
-        pickle.dump(blocking_results, file_)
-    return blocking_results
-
-
-def matching(config: Configuration, blocking_result_):
-    print(f'\033[33mConfig: {config}\033[0m')
-    start = time.time()
-    ltable = blocking_result_[0]
-    rtable = blocking_result_[1]
-    train_set = blocking_result_[2]
-    test_set = blocking_result_[3]
-    md_list = blocking_result_[4]
-    block_recall = blocking_result_[5]
+    train_set = pd.read_csv(directory_path + r'\train_whole.csv', encoding='ISO-8859-1')
+    test_set = pd.read_csv(directory_path + r'\test_whole.csv', encoding='ISO-8859-1')
+    ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1')
+    rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1')
    ml_matcher = config["ml_matcher"]
    match ml_matcher:
        case "dt":
@ -109,22 +40,22 @@ def matching(config: Configuration, blocking_result_):
                                   max_features=config['rf_max_features'])

    cm.set_key(train_set, '_id')
-    cm.set_fk_ltable(train_set, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(train_set, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(train_set, 'ltable_id')
+    cm.set_fk_rtable(train_set, 'rtable_id')
    cm.set_ltable(train_set, ltable)
    cm.set_rtable(train_set, rtable)
-    cm.set_key(ltable, ltable_id)
-    cm.set_key(rtable, rtable_id)
+    cm.set_key(ltable, 'id')
+    cm.set_key(rtable, 'id')

    cm.set_key(test_set, '_id')
-    cm.set_fk_ltable(test_set, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(test_set, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(test_set, 'ltable_id')
+    cm.set_fk_rtable(test_set, 'rtable_id')
    cm.set_ltable(test_set, ltable)
    cm.set_rtable(test_set, rtable)
    feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False)
    train_feature_vecs = em.extract_feature_vecs(train_set,
                                                 feature_table=feature_table,
-                                                 attrs_after=['gold'],
+                                                 attrs_after=['label'],
                                                 show_progress=False)
    train_feature_vecs.fillna(value=0, inplace=True)

@ -132,22 +63,21 @@ def matching(config: Configuration, blocking_result_):
    for _ in test_feature_after[:]:
        test_feature_after.append(_.replace('ltable_', 'rtable_'))
    for _ in test_feature_after:
-        if _.endswith(ltable_id) or _.endswith(rtable_id):
+        if _.endswith('id'):
            test_feature_after.remove(_)
-    test_feature_after.append('gold')
+    test_feature_after.append('label')
    test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
                                                attrs_after=test_feature_after, show_progress=False)
    test_feature_vecs.fillna(value=0, inplace=True)

-    fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold']
-    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
-    test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id])
+    fit_exclude = ['_id', 'ltable_id', 'rtable_id', 'label']
+    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='label')
+    test_feature_after.extend(['_id', 'ltable_id', 'rtable_id'])
    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                  append=True, target_attr='predicted', inplace=False)
-    eval_result = em.eval_matches(predictions, 'gold', 'predicted')
+    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    em.print_eval_summary(eval_result)
-    indicators = evaluate_prediction(predictions, 'gold', 'predicted')
-    indicators['block_recall'] = block_recall
+    indicators = evaluate_prediction(predictions, 'label', 'predicted')

    test_feature_after.remove('_id')
    test_feature_after.append('predicted')
@ -158,31 +88,26 @@ def matching(config: Configuration, blocking_result_):
    # 目前predictions包含的属性：左右表全部属性+gold+predicted
    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
    predictions['confidence'] = 0
+    predictions['md'] = ''

    epl_match = 0  # 可解释，预测match
    if len(md_list) > 0:
        for row in tqdm(predictions.itertuples()):
-            x = is_explicable(row, md_list, sim_tensor_dict)
-            if x > 0 and str(getattr(row, 'predicted')) == str(1):
-                predictions.loc[row[0], 'confidence'] = x
+            if str(getattr(row, 'predicted')) == str(1):
+                conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
+                if conf > 0:
+                    predictions.loc[row[0], 'confidence'] = conf
+                    predictions.loc[row[0], 'md'] = str(md_dict)
                    epl_match += 1

    df = predictions[predictions['predicted'] == str(1)]
    interpretability = epl_match / len(df)  # 可解释性
    indicators['interpretability'] = interpretability
-
-    # note 既然不调block参数, 不妨假设block_recall很高, 不必考虑
-    # if indicators["block_recall"] < indicators["recall"]:
-    #     f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
-    #                 indicators["precision"] + indicators["block_recall"])
-    # else:
-    #     f1 = indicators["F1"]
-
    performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
    indicators['performance'] = performance
-    print(f'ER Indicators: {indicators}')
-    predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
-    print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m')
+    print(Fore.BLUE + f'ER Indicators: {indicators}')
+    predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
+    print(Fore.CYAN + f'Finish Time: {time.time()}')
    return indicators


@ -223,7 +148,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
            col_tuple_list.append((left_index, right_index))

    length = predictions.shape[0]
-    width = predictions.shape[1]
+    # width = predictions.shape[1]
    predictions = predictions.reset_index(drop=True)
    sentences = predictions.values.flatten(order='F').tolist()

@ -238,7 +163,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
        rattr_tensor = norm_table_tensor[col_tuple[1]]
        mul_tensor = lattr_tensor * rattr_tensor
        sim_tensor = torch.sum(mul_tensor, 1)
-        sim_tensor = torch.round(sim_tensor, decimals=4)
+        sim_tensor = torch.round(sim_tensor, decimals=2)
        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
    return sim_tensor_dict

@ -252,31 +177,28 @@ def is_explicable(row, all_mds: list, st_dict):
                explicable = False  # 任意一个字段的相似度达不到阈值，这条md就不能解释当前元组
                break  # 不再与当前md的其他相似度阈值比较，跳转到下一条md
        if explicable:
-            return md_tuple[2]  # 任意一条md能解释，直接返回
-    return -1.0  # 遍历结束，不能解释
+            return md_tuple[2], md_tuple[0]  # 任意一条md能解释，直接返回
+    return -1.0, {}  # 遍历结束，不能解释


-def ml_er(config: Configuration, blocking_result_):
-    indicators = matching(config, blocking_result_)
-    output_path = er_output_dir + "eval_result.txt"
+def ml_er(config: Configuration):
+    indicators = matching(config)
+    output_path = er_output_dir + r"\eval_result.txt"
    with open(output_path, 'w') as _f:
        _f.write('Precision:' + str(indicators["precision"]) + '\n')
        _f.write('Recall:' + str(indicators["recall"]) + '\n')
        _f.write('F1:' + str(indicators["F1"]) + '\n')
-        _f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
        _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
        _f.write('performance:' + str(indicators['performance']) + '\n')


 if __name__ == '__main__':
-    if os.path.isfile(hpo_output_dir + "incumbent.json"):
-        with open(hpo_output_dir + "configspace.json", 'r') as f:
+    if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
+        with open(hpo_output_dir + r"\configspace.json", 'r') as f:
            dict_configspace = json.load(f)
        str_configspace = json.dumps(dict_configspace)
        configspace = csj.read(str_configspace)
-        with open(hpo_output_dir + "incumbent.json", 'r') as f:
+        with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
            dic = json.load(f)
        configuration = ConfigSpace.Configuration(configspace, values=dic)
-        with open(er_output_dir + "blocking_result.pickle", "rb") as file:
-            blocking_result = pickle.load(file)
-        ml_er(configuration, blocking_result)
+        ml_er(configuration)
--- a/ml_er/magellan_start.py
+++ b/ml_er/magellan_start.py
@ -1,4 +0,0 @@
-from ml_er.magellan_er import blocking_mining
-
-if __name__ == '__main__':
-    blocking_mining()
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@ -1,363 +0,0 @@
-import json
-import os
-import sys
-import time
-
-import ConfigSpace
-import pandas
-import torch
-from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
-from ConfigSpace.read_and_write import json as csj
-import py_entitymatching as em
-import py_entitymatching.catalog.catalog_manager as cm
-import pandas as pd
-import six
-from ConfigSpace import Configuration
-from tqdm import tqdm
-
-from md_discovery.md_discover import md_discover
-from settings import *
-
-
-def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
-    df = train[train['gold'] == 1]
-    # 元组对左右ID调整一致
-    for index, row in df.iterrows():
-        df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
-
-    train_columns = train.columns.values.tolist()
-    l_columns = []
-    r_columns = []
-    cols = []
-    # 左表和右表字段名分别加入两个列表
-    for _ in train_columns:
-        if _.startswith('ltable'):
-            l_columns.append(_)
-        elif _.startswith('rtable'):
-            r_columns.append(_)
-    # 将左表中字段名去掉前缀，作为统一的字段名列表(前提是两张表内对应字段名调整一致)
-    for _ in l_columns:
-        cols.append(_.replace('ltable_', ''))
-
-    ldf = df[l_columns]
-    rdf = df[r_columns]
-    ldf.columns = cols
-    rdf.columns = cols
-    t_single_tuple = pd.concat([ldf, rdf])
-    t_single_tuple = t_single_tuple.reset_index(drop=True)
-
-    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
-
-
-def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
-                        candidate: pd.DataFrame) -> dict:
-    new_df = df.reset_index(drop=False, inplace=False)
-    gold = new_df[labeled_attr]
-    predicted = new_df[predicted_attr]
-    gold_negative = gold[gold == 0].index.values
-    gold_positive = gold[gold == 1].index.values
-    predicted_negative = predicted[predicted == 0].index.values
-    predicted_positive = predicted[predicted == 1].index.values
-
-    false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
-    true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
-    false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
-
-    num_true_positives = float(len(true_positive_indices))
-    num_false_positives = float(len(false_positive_indices))
-    num_false_negatives = float(len(false_negative_indices))
-
-    precision_denominator = num_true_positives + num_false_positives
-    recall_denominator = num_true_positives + num_false_negatives
-
-    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
-    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
-    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
-    block_recall = len(candidate[candidate['gold'] == 1]) / matching_number
-
-    return {"precision": precision, "recall": recall, "F1": F1, "block_recall": block_recall}
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # 传入md路径列表
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # 打开每一个md文件
-        with open(md_path, 'r') as f:
-            # 读取每一行的md，加入该文件的md列表
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                # todo 如果MD文件的形式改了 这里也要改
-                md = eval(md_metadata[1])
-                mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-def is_explicable(row, all_mds: list, st_dict):
-    attrs = all_mds[0][0].keys()  # 从第一条md_tuple中的md字典中读取所有字段
-    for md_tuple in all_mds:
-        explicable = True  # 假设这条md能解释当前元组
-        for a in attrs:
-            if a != target_attr:
-                if st_dict[a][row[0]].item() < md_tuple[0][a]:
-                    explicable = False  # 任意一个字段的相似度达不到阈值，这条md就不能解释当前元组
-                    break  # 不再与当前md的其他相似度阈值比较，跳转到下一条md
-        if explicable:
-            return md_tuple[2]  # 任意一条md能解释，直接返回
-    return -1.0  # 遍历结束，不能解释
-
-
-# 形成一个字典，key为字段名称，value为一维张量，记录了预测表中这一字段每行的左右属性的相似度
-def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
-    predictions_attrs = predictions.columns.values.tolist()
-    col_tuple_list = []
-    for _ in predictions_attrs:
-        if _.startswith('ltable'):
-            left_index = predictions_attrs.index(_)
-            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
-            col_tuple_list.append((left_index, right_index))
-
-    length = predictions.shape[0]
-    width = predictions.shape[1]
-    sentences = []
-    for col in range(0, width):
-        for row in range(0, length):
-            cell_value = predictions.values[row, col]
-            sentences.append(cell_value)
-    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
-    split_embedding = torch.split(embedding, length, dim=0)
-    table_tensor = torch.stack(split_embedding, dim=0, out=None)
-    # prediction的归一化嵌入张量
-    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
-    sim_tensor_dict = {}
-    for col_tuple in col_tuple_list:
-        lattr_tensor = norm_table_tensor[col_tuple[0]]
-        rattr_tensor = norm_table_tensor[col_tuple[1]]
-        mul_tensor = lattr_tensor * rattr_tensor
-        sim_tensor = torch.sum(mul_tensor, 1)
-        sim_tensor = torch.round(sim_tensor, decimals=4)
-        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
-    return sim_tensor_dict
-
-
-def er_process(config: Configuration):
-    start = time.time()
-    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-    cm.set_key(ltable, ltable_id)
-    # ltable.fillna("", inplace=True)
-    rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-    cm.set_key(rtable, rtable_id)
-    # rtable.fillna("", inplace=True)
-    mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
-
-    # 仅保留两表中出现在映射表中的行，增大正样本比例
-    lid_mapping_list = []
-    rid_mapping_list = []
-    # 全部转为字符串
-    # ltable = ltable.astype(str)
-    # rtable = rtable.astype(str)
-    # mappings = mappings.astype(str)
-    matching_number = len(mappings)  # 所有阳性样本数
-
-    for index, row in mappings.iterrows():
-        lid_mapping_list.append(row[mapping_lid])
-        rid_mapping_list.append(row[mapping_rid])
-
-    selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-    tables_id = rtable_id
-    selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
-    selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
-    attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
-    attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
-    cm.set_key(selected_ltable, tables_id)
-    cm.set_key(selected_rtable, tables_id)
-
-    blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
-                                     config["block_attr"], allow_missing=True,
-                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                     overlap_size=1, show_progress=False)
-
-    candidate['gold'] = 0
-    candidate = candidate.reset_index(drop=True)
-    candidate_match_rows = []
-    for row in candidate.itertuples():
-        l_id = getattr(row, 'ltable_' + tables_id)
-        map_row = mappings[mappings[mapping_lid] == l_id]
-
-        if map_row is not None:
-            r_id = map_row[mapping_rid]
-            for value in r_id:
-                if value == getattr(row, 'rtable_' + tables_id):
-                    candidate_match_rows.append(row[0])
-        else:
-            continue
-
-    for _ in candidate_match_rows:
-        candidate.loc[_, 'gold'] = 1
-
-    candidate.fillna(value="", inplace=True)
-
-    # 裁剪负样本，保持正负样本数量一致
-    candidate_mismatch = candidate[candidate['gold'] == 0]
-    candidate_match = candidate[candidate['gold'] == 1]
-    if len(candidate_mismatch) > len(candidate_match):
-        candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
-    # 拼接正负样本
-    candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
-    # if len(candidate_for_train_test) == 0:
-    #     return 0
-    # 如果拼接后不重设索引可能导致索引重复
-    candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
-    cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
-    cm.set_ltable(candidate_for_train_test, selected_ltable)
-    cm.set_rtable(candidate_for_train_test, selected_rtable)
-    block_recall = len(candidate_match) / matching_number
-
-    # 分为训练测试集
-    train_proportion = 0.7
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
-    train_set = sets['train']
-    test_set = sets['test']
-
-    # cm.set_key(train_set, '_id')
-    # cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
-    # cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
-    # cm.set_ltable(train_set, selected_ltable)
-    # cm.set_rtable(train_set, selected_rtable)
-    #
-    # cm.set_key(test_set, '_id')
-    # cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
-    # cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
-    # cm.set_ltable(test_set, selected_ltable)
-    # cm.set_rtable(test_set, selected_rtable)
-
-    ml_matcher = config["ml_matcher"]
-    if ml_matcher == "dt":
-        matcher = em.DTMatcher(name='DecisionTree', random_state=0)
-    elif ml_matcher == "svm":
-        matcher = em.SVMMatcher(name='SVM', random_state=0)
-    elif ml_matcher == "rf":
-        matcher = em.RFMatcher(name='RF', random_state=0)
-    elif ml_matcher == "lg":
-        matcher = em.LogRegMatcher(name='LogReg', random_state=0)
-    elif ml_matcher == "ln":
-        matcher = em.LinRegMatcher(name='LinReg')
-    elif ml_matcher == "nb":
-        matcher = em.NBMatcher(name='NaiveBayes')
-
-    feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
-
-    train_feature_vecs = em.extract_feature_vecs(train_set,
-                                                 feature_table=feature_table,
-                                                 attrs_after=['gold'],
-                                                 show_progress=False)
-    train_feature_vecs.fillna(value=0, inplace=True)
-
-    test_feature_after = attrs_with_l_prefix[:]
-    test_feature_after.extend(attrs_with_r_prefix)
-    for _ in test_feature_after:
-        if _.endswith(tables_id):
-            test_feature_after.remove(_)
-    test_feature_after.append('gold')
-    test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                attrs_after=test_feature_after, show_progress=False)
-    test_feature_vecs.fillna(value=0, inplace=True)
-
-    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
-    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
-    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
-    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
-                                  append=True, target_attr='predicted', inplace=False)
-    eval_result = em.eval_matches(predictions, 'gold', 'predicted')
-    em.print_eval_summary(eval_result)
-    indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
-
-    # 计算可解释性
-    ################################################################################################################
-    predictions_attrs = []
-    predictions_attrs.extend(attrs_with_l_prefix)
-    predictions_attrs.extend(attrs_with_r_prefix)
-    predictions_attrs.extend(['gold', 'predicted'])
-    predictions = predictions[predictions_attrs]
-    # 必须从训练集内挖MD
-    train_attrs = predictions_attrs[:]
-    train_attrs.remove('predicted')
-    train_set = train_set[train_attrs]
-    prepare_file_for_md_discovery(train_set)
-    predictions = predictions.reset_index(drop=True)
-    predictions = predictions.astype(str)
-    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
-    predictions['confidence'] = 0
-
-    md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
-    md_paths = [md_output_dir + 'mds.txt']
-    md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
-    epl_match = 0  # 可解释，预测match
-
-    unexplainable = pd.DataFrame()
-
-    if len(md_list) > 0:
-        for row in tqdm(predictions.itertuples()):
-            x = is_explicable(row, md_list, sim_tensor_dict)
-            if x > 0 and str(getattr(row, 'predicted')) == str(1):
-                predictions.loc[row[0], 'confidence'] = x
-                epl_match += 1
-        #     else:
-        #         series = pd.Series(row)
-        #         unexplainable = unexplainable._append(series, ignore_index=True)
-        # unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
-        # unexplainable.columns = predictions_attrs
-        # unexplainable = unexplainable[train_attrs]
-        # if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
-        #     prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
-        #     md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")
-
-    df = predictions[predictions['predicted'] == str(1)]
-    interpretability = epl_match / len(df)  # 可解释性
-    indicators['interpretability'] = interpretability
-    if indicators["block_recall"] < indicators["recall"]:
-        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
-                    indicators["precision"] + indicators["block_recall"])
-    else:
-        f1 = indicators["F1"]
-    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
-    indicators['performance'] = performance
-    indicators['eval_result'] = eval_result
-    print(indicators)
-    predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
-    ################################################################################################################
-    print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
-    return indicators
-
-
-def ml_er(config: Configuration = None):
-    indicators = er_process(config)
-    output_path = er_output_dir + "eval_result.txt"
-    with open(output_path, 'w') as f:
-        for key, value in six.iteritems(_get_metric(indicators['eval_result'])):
-            f.write(key + " : " + value)
-            f.write('\n')
-        f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
-        f.write('interpretability:' + str(indicators['interpretability']) + '\n')
-        f.write('performance:' + str(indicators['performance']) + '\n')
-
-
-if __name__ == '__main__':
-    if os.path.isfile(hpo_output_dir + "incumbent.json"):
-        with open(hpo_output_dir + "configspace.json", 'r') as f:
-            dict_configspace = json.load(f)
-        str_configspace = json.dumps(dict_configspace)
-        configspace = csj.read(str_configspace)
-        with open(hpo_output_dir + "incumbent.json", 'r') as f:
-            dic = json.load(f)
-        configuration = ConfigSpace.Configuration(configspace, values=dic)
-        ml_er(configuration)
--- a/settings.py
+++ b/settings.py
@ -1,24 +1,12 @@
 from sentence_transformers import SentenceTransformer

-ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
-mapping_lid = 'idAbt'  # mapping表中左表id名
-mapping_rid = 'idBuy'  # mapping表中右表id名
-ltable_block_attr = 'name'
-rtable_block_attr = 'name'
-ltable_id = 'id'  # 左表id字段名称
-rtable_id = 'id'  # 右表id字段名称
-target_attr = 'id'  # 进行md挖掘时的目标字段
-# lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
+directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon'

-model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
+er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\ml_er\output'
+md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\md_discovery\output'
+hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\hpo\output'
+
+model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2')
 interpre_weight = 0  # 可解释性权重
-similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.75
-
-er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
-