Complete the Magellan experiment

MD-metrics-HPO
HuangJintao 5 months ago
parent f1e732afaf
commit fe5c1288ed

@@ -1,73 +1,73 @@
# Cluster the data points together with the MDs # # Cluster the data points together with the MDs
import os # import os
import numpy as np # import numpy as np
import pandas as pd # import pandas as pd
from matplotlib import pyplot as plt # from matplotlib import pyplot as plt
#
from draw_md_cluster import DBSCAN # from draw_md_cluster import DBSCAN
from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict # from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
#
#
def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_): # def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
clusterNum = len(set(labels_)) # clusterNum = len(set(labels_))
fig = plt.figure() # fig = plt.figure()
scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'] # scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
ax = fig.add_subplot(111, projection='3d') # ax = fig.add_subplot(111, projection='3d')
for i in range(-1, clusterNum): # for i in range(-1, clusterNum):
colorStyle = scatterColors[i % len(scatterColors)] # colorStyle = scatterColors[i % len(scatterColors)]
subCluster = md_data_[np.where(labels_ == i)] # subCluster = md_data_[np.where(labels_ == i)]
ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12) # ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x') # ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
if pre_mismatch_points_.shape[0] > 0: # if pre_mismatch_points_.shape[0] > 0:
ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x') # ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
ax.set_xlabel(md_keys_[0], rotation=0) # set the label rotation angle # ax.set_xlabel(md_keys_[0], rotation=0) # set the label rotation angle
ax.set_ylabel(md_keys_[1], rotation=-45) # ax.set_ylabel(md_keys_[1], rotation=-45)
ax.set_zlabel(md_keys_[2], rotation=0) # ax.set_zlabel(md_keys_[2], rotation=0)
plt.title(output_path_.split('\\')[-1].split('.')[0]) # plt.title(output_path_.split('\\')[-1].split('.')[0])
plt.savefig(output_path_, dpi=500) # plt.savefig(output_path_, dpi=500)
plt.show() # plt.show()
#
#
if __name__ == '__main__': # if __name__ == '__main__':
outcome_path = r'E:\Data\Research\Outcome' # outcome_path = r'E:\Data\Research\Outcome'
config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5' # config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()] # dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
for dataset_name in dataset_name_list: # for dataset_name in dataset_name_list:
absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # path to the MDs # absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # path to the MDs
predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # path to the predictions # predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # path to the predictions
pred = pd.read_csv(predictions) # pred = pd.read_csv(predictions)
pred = pred.astype(str) # pred = pred.astype(str)
# pred = pred[pred['predicted'] == str(1)] # # pred = pred[pred['predicted'] == str(1)]
sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred) # sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
# 选取的三个字段 # # 选取的三个字段
md_keys = [] # md_keys = []
with open(absolute_path, 'r') as f: # with open(absolute_path, 'r') as f:
# Read the MD on each line and append it to this file's MD list # # Read the MD on each line and append it to this file's MD list
md_data = [] # md_data = []
for line in f.readlines(): # for line in f.readlines():
md_metadata = line.strip().split('\t') # md_metadata = line.strip().split('\t')
md_tuple = eval(md_metadata[1]) # md_tuple = eval(md_metadata[1])
md_keys = list(md_tuple[0].keys())[1:4] # md_keys = list(md_tuple[0].keys())[1:4]
md_values = list(md_tuple[0].values()) # md_values = list(md_tuple[0].values())
md_data.append(md_values[1:4]) # md_data.append(md_values[1:4])
if len(md_data) == 10000: # if len(md_data) == 10000:
break # break
#
pre_match_points = [] # pre_match_points = []
pre_mismatch_points = [] # pre_mismatch_points = []
for _ in pred.itertuples(): # for _ in pred.itertuples():
data_point_value = [] # data_point_value = []
for key in md_keys: # for key in md_keys:
sim_tensor = sim_tensor_dict[key] # sim_tensor = sim_tensor_dict[key]
data_point_value.append(round(float(sim_tensor[_[0]]), 4)) # data_point_value.append(round(float(sim_tensor[_[0]]), 4))
if getattr(_, 'predicted') == str(1): # if getattr(_, 'predicted') == str(1):
pre_match_points.append(data_point_value) # pre_match_points.append(data_point_value)
elif getattr(_, 'predicted') == str(0): # elif getattr(_, 'predicted') == str(0):
pre_mismatch_points.append(data_point_value) # pre_mismatch_points.append(data_point_value)
#
md_data = np.array(md_data, dtype=np.float32) # md_data = np.array(md_data, dtype=np.float32)
pre_match_points = np.array(pre_match_points, dtype=np.float32) # pre_match_points = np.array(pre_match_points, dtype=np.float32)
pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32) # pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
labels = DBSCAN(md_data, 0.5, 30) # labels = DBSCAN(md_data, 0.5, 30)
output_path = outcome_path + rf'\{dataset_name}_MD&data.png' # output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path) # plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)
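Review note: the loop above splits every line of mds.txt on '\t' and passes the second field to eval. Assuming the line format written by md_discover below ('Target:<attr>' + '\t' + the string of an (md_dict, support, confidence) tuple), a safer sketch of the same parsing uses ast.literal_eval, which only accepts Python literals (function and variable names here are illustrative):

import ast

def read_md_tuples(md_path, limit=10000):
    # Each kept line yields an (md_dict, support, confidence) tuple
    md_tuples = []
    with open(md_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            md_tuples.append(ast.literal_eval(fields[1]))
            if len(md_tuples) >= limit:
                break
    return md_tuples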

@@ -1,12 +1,14 @@
import json import json
import pickle import pickle
import time
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario from colorama import Fore, init
from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade
from ml_er.magellan_er import matching from ml_er.magellan_er import matching
from settings import * from settings import *
@@ -61,9 +63,7 @@ class Optimization:
def train(self, config: Configuration, seed: int = 0) -> float: def train(self, config: Configuration, seed: int = 0) -> float:
cm.del_catalog() cm.del_catalog()
with open(er_output_dir + "blocking_result.pickle", "rb") as file: indicators = matching(config)
blocking_result = pickle.load(file)
indicators = matching(config, blocking_result)
return 1 - indicators['performance'] return 1 - indicators['performance']
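Review note: train() returns 1 - indicators['performance'], and matching() below defines performance = interpre_weight * interpretability + (1 - interpre_weight) * F1. With interpre_weight = 0, as in the new settings.py, the cost reduces to 1 - F1: an illustrative configuration reaching F1 = 0.92 would be reported to SMAC as cost 1 - 0.92 = 0.08 (the F1 value here is made up).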
@@ -73,20 +73,20 @@ def ml_er_hpo():
str_configspace = csj.write(cs) str_configspace = csj.write(cs)
dict_configspace = json.loads(str_configspace) dict_configspace = json.loads(str_configspace)
# Save the hyperparameter space locally # Save the hyperparameter space locally
with open(hpo_output_dir + "configspace.json", "w") as f: with open(hpo_output_dir + r"\configspace.json", "w") as f:
json.dump(dict_configspace, f, indent=4) json.dump(dict_configspace, f, indent=4)
scenario = Scenario( scenario = Scenario(
cs, cs,
crash_cost=1.0, crash_cost=1.0,
deterministic=True, deterministic=True,
n_trials=20, n_trials=16,
n_workers=1 n_workers=1
) )
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = HyperparameterOptimizationFacade( smac = BlackBoxFacade(
scenario, scenario,
optimization.train, optimization.train,
initial_design=initial_design, initial_design=initial_design,
@@ -97,19 +97,21 @@ def ml_er_hpo():
incumbent_cost = smac.validate(incumbent) incumbent_cost = smac.validate(incumbent)
default = cs.get_default_configuration() default = cs.get_default_configuration()
default_cost = smac.validate(default) default_cost = smac.validate(default)
print(f"Default Cost: {default_cost}") print(Fore.BLUE + f"Default Cost: {default_cost}")
print(f"Incumbent Cost: {incumbent_cost}") print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")
if incumbent_cost > default_cost: if incumbent_cost > default_cost:
incumbent = default incumbent = default
print(f"Updated Incumbent Cost: {default_cost}") print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')
print(f"Optimized Configuration:{incumbent.values()}") print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")
with open(hpo_output_dir + "incumbent.json", "w") as f: with open(hpo_output_dir + r"\incumbent.json", "w") as f:
json.dump(dict(incumbent), f, indent=4) json.dump(dict(incumbent), f, indent=4)
return incumbent return incumbent
if __name__ == '__main__': if __name__ == '__main__':
init(autoreset=True)
print(Fore.CYAN + f'Start Time: {time.time()}')
ml_er_hpo() ml_er_hpo()

@@ -1,200 +0,0 @@
import operator
import numpy as np
import pandas as pd
import copy
import torch
from ConfigSpace import Configuration
from tqdm import tqdm
from settings import model, similarity_threshold, support_threshold, confidence_threshold
def is_minimal(md, md_list, target_col):
# Assume this MD is minimal
if len(md_list) == 0:
return True
minimal = True
for _ in md_list:
if isinstance(_, tuple):
_ = _[0]
if _ != md:
other_cols = list(set(_.keys()) - {target_col})
# Assume each MD in the list makes the current MD non-minimal
exist = True
# If any LHS threshold is larger, the assumption fails
for col in other_cols:
if _[col] > md[col]:
exist = False
break
# If the RHS threshold is smaller, the assumption also fails
if _[target_col] < md[target_col]:
exist = False
# If the assumption holds for any MD in the list, the current MD is not minimal
if exist:
minimal = False
break
return minimal
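Review note: a toy check of is_minimal (attribute names and thresholds are illustrative only):

# with target_col = 'id'
md_a = {'name': 0.8, 'price': 0.7, 'id': 1.0}
md_b = {'name': 0.7, 'price': 0.7, 'id': 1.0}
# is_minimal(md_a, [md_b], 'id') -> False: every LHS threshold of md_b is <= the corresponding
# threshold of md_a and its RHS is not smaller, so md_b is a more general rule that already covers md_a.
# is_minimal(md_b, [md_a], 'id') -> True: md_a demands a strictly higher 'name' similarity, so it does not dominate md_b.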
def pairs_inference(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
# sim_tensor = torch.round(sim_tensor, decimals=3)
# torch.save(sim_tensor, md_output_dir + "tensor.pt")
md_list = []
minimal_vio = []
init_md = {}
for col in columns:
init_md[col] = 1 if col == target_col else -1
md_list.append(init_md)
for row1 in tqdm(range(0, length - 1)):
terminate = False
for row2 in range(row1 + 1, length):
violated_mds = []
# sims holds the per-column similarities between the two rows
sims = {}
for col_index in range(0, width):
col = columns[col_index]
similarity = sim_tensor[col_index, row1, row2].item()
sims[col] = similarity
# Find violated MDs, remove them from the MD list and add them to the violation list
for md in md_list[:]:
lhs_satis = True
rhs_satis = True
for col in cols_but_target:
if sims[col] < md[col]:
lhs_satis = False
break
if sims[target_col] < md[target_col]:
rhs_satis = False
if lhs_satis == True and rhs_satis == False:
md_list.remove(md)
violated_mds.append(md)
# for vio_md in violated_mds:
# # Specialize the LHS
# for col in cols_but_target:
# if sims[col] + 0.01 <= 1:
# spec_l_md = copy.deepcopy(vio_md)
# spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
# if is_minimal(spec_l_md, md_list, target_col):
# md_list.append(spec_l_md)
# if vio_md not in minimal_vio:
# minimal_vio.append(vio_md)
for vio_md in violated_mds:
vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
if vio_md_support >= support_threshold:
for col in cols_but_target:
if sims[col] < 1.0:
spec_l_md = copy.deepcopy(vio_md)
if sims[col] < similarity_threshold:
spec_l_md[col] = similarity_threshold
else:
if sims[col] + 0.01 <= 1.0:
spec_l_md[col] = sims[col] + 0.01
else:
spec_l_md[col] = 1.0
if is_minimal(spec_l_md, md_list, target_col):
md_list.append(spec_l_md)
if vio_md not in minimal_vio:
minimal_vio.append(vio_md)
if len(md_list) == 0:
terminate = True
break
if terminate:
break
if len(minimal_vio) > 0:
for md in minimal_vio[:]:
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
minimal_vio.append((md, support, confidence))
minimal_vio.remove(md)
if len(md_list) > 0:
# Remove duplicate MDs
tmp = []
for _ in md_list:
if _ not in tmp:
tmp.append(_)
md_list = tmp
# Remove MDs whose support is below the threshold
for _ in md_list[:]:
support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
md_list.append((_, support, confidence))
md_list.remove(_)
# Remove non-minimal MDs
for md_tuple in md_list[:]:
if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
md_list.remove(md_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
result = []
result.extend(md_list)
result.extend(minimal_vio)
result.sort(key=operator.itemgetter(2), reverse=True)
print(f'\033[33mList Length: {len(result)}\033[0m')
return result
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
columns = data.columns.values.tolist()
length = data.shape[0]
width = data.shape[1]
md_tensor = list(current_md.values())
md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = ini_slice.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
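Review note: in get_metrics above, support_Naumann counts the unordered tuple pairs whose similarities reach every LHS threshold and support_Fan those that additionally reach the RHS threshold; subtracting length and dividing by 2 removes the diagonal self-pairs and the symmetric duplicates of the length x length boolean matrix, and confidence = support_Fan / support_Naumann. With illustrative counts, 40 LHS-satisfying pairs of which 30 also satisfy the RHS give support_Fan = 30 and confidence = 30 / 40 = 0.75, which is exactly the confidence_threshold in settings.py.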

@@ -1,138 +0,0 @@
import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
sample_number = 100000
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
# md_tensor = list(current_md.values())
# md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = conf_tensor.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
def build_cartesian(width, target_index):
all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
num=math.ceil((1-similarity_threshold)/step_length) + 1)
all_values_array = np.round(all_values_array, 4)
all_values_tensor = torch.tensor(all_values_array, device='cuda')
all_values_tensor = all_values_tensor.float()
all_values_tensor = torch.round(all_values_tensor, decimals=4)
tensors_for_cartesian = []
for i in range(0, width):
if i == target_index:
t = torch.tensor([1.0], device='cuda')
tensors_for_cartesian.append(t)
else:
tensors_for_cartesian.append(all_values_tensor)
result = torch.cartesian_prod(*tensors_for_cartesian)
return result
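Review note: with similarity_threshold = 0.1 (the value removed from settings.py in this commit) and step_length = 0.01, all_values_array holds ceil(0.9 / 0.01) + 1 = 91 candidate thresholds, so for a five-column table (one target column) the Cartesian product built here has 91^4, roughly 6.9 * 10^7, candidate MDs; that size is why discover() below caps the candidates at sample_number / (width - 1) and switches to random generation for wider tables.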
def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
cartesian_product = cartesian_product.unsqueeze(2)
cartesian_product = cartesian_product.unsqueeze(3)
cartesian_product = cartesian_product.repeat(1, 1, length, length)
def discover(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
# Indices of all columns except the target column
columns_indices = [_ for _ in range(0, width) if _ != target_index]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
sim_tensor = sim_tensor.float()
sim_tensor = torch.round(sim_tensor, decimals=4)
# With fewer than 6 columns a Cartesian product is feasible; with more it may blow up exponentially
if width < 6:
# Enumerate all values of every non-target column and take their Cartesian product; the result is every possible MD threshold combination
cartesian = build_cartesian(width, target_index)
# Sample sample_number / (width - 1) MDs (no -1 entries)
if cartesian.shape[0] > sample_number / (width - 1):
index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
cartesian = torch.index_select(cartesian, 0, index)
else:
# Randomly generate sample_number / (width - 1) MDs: draw integers with randint, then divide to get decimals (no -1 entries)
cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
(math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
cartesian = cartesian / 100
# Generate a target column of similarity 1.0 and insert it at the target column's position
ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
cartesian = torch.round(cartesian, decimals=4)
# This tensor will be concatenated with the tensors in which some columns are set to -1
joint_md_tensor = cartesian.clone()
# Randomly set 1 column, 2 columns, ... to -1
for i in range(width - 2):
index_list_format = []
for j in range(cartesian.shape[0]):
# For each MD, randomly pick the column indices to be set to -1
index_list_format.append(random.sample(columns_indices, i + 1))
index = torch.tensor(index_list_format, device='cuda')
# The MD set after the chosen columns have been set to -1
modified_cartesian = cartesian.scatter(1, index, -1)
joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
md_list = []
# get_metric_tensor(cartesian, data, sim_tensor, target_index)
for _ in tqdm(range(joint_md_tensor.shape[0])):
s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
if s >= support_threshold and c >= confidence_threshold:
md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
md_dict_format = {}
for k in range(0, width):
md_dict_format[columns[k]] = md_list_format[k]
md_list.append((md_dict_format, s, c))
md_list.sort(key=operator.itemgetter(2), reverse=True)
return md_list

@@ -1,27 +0,0 @@
from ConfigSpace import Configuration
from md_discovery.discovery_executor import pairs_inference
from md_discovery.discovery_executor_gpu import discover
from settings import *
# # If support and confidence are not output, use the following two blocks of code
# # Write list 1 to a local file; adjust the path yourself
# md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
# with open(md_path, 'w') as f:
# for _ in mds:
# f.write(str(_) + '\n')
#
# # Write list 2 to a local file; adjust the path yourself
# vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
# with open(vio_path, 'w') as f:
# for _ in vio:
# f.write(str(_) + '\n')
def md_discover(config: Configuration, source_path, target_path):
mds_list = discover(source_path, target_attr)
with open(target_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')

@@ -1,4 +1,5 @@
import itertools import itertools
import pickle
import random import random
import operator import operator
from operator import itemgetter from operator import itemgetter
@@ -16,10 +17,12 @@ from settings import *
def mining(train: pd.DataFrame): def mining(train: pd.DataFrame):
# data is train set, in which each row represents a tuple pair # data is train set, in which each row represents a tuple pair
train = train.astype(str) train = train.astype(str)
# Move the label column to the end
train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)
# Instead of manually aligning the left/right table keys, only check whether the gold attribute is 1 # Instead of manually aligning the left/right table keys, only check whether the gold attribute is 1
# So the left/right table keys are dropped directly # So the left/right table keys are dropped directly
data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False) data = train.drop(columns=['_id', 'ltable_id', 'rtable_id'], inplace=False)
# Attributes now in data: the left/right table attributes (keys excluded) and gold, without _id # Attributes now in data: the left/right table attributes (keys excluded) and gold, without _id
columns = data.columns.values.tolist() columns = data.columns.values.tolist()
columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')] columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')]
@@ -62,7 +65,7 @@ def mining(train: pd.DataFrame):
# Build the labeled similarity tensor # Build the labeled similarity tensor
sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1) sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1)
# Find the row indices of matching tuple pairs # Find the row indices of matching tuple pairs
mask = (data['gold'].isin(['1'])) mask = (data['label'].isin(['1']))
match_pair_indices = data[mask].index.tolist() match_pair_indices = data[mask].index.tolist()
# Set the label of the matching rows to 1 according to those indices # Set the label of the matching rows to 1 according to those indices
sim_table_tensor_labeled[match_pair_indices, -1] = 1.00 sim_table_tensor_labeled[match_pair_indices, -1] = 1.00
@@ -209,7 +212,7 @@ def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list):
def mds_to_txt(result_list_): def mds_to_txt(result_list_):
p = md_output_dir + "mds.txt" p = md_output_dir + r"\mds.txt"
with open(p, 'w') as f: with open(p, 'w') as f:
for _ in result_list_: for _ in result_list_:
f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}') f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
@@ -253,3 +256,10 @@ def merge_mds(md_list_):
del same_sc_list[index] del same_sc_list[index]
# Flatten the nested list into a flat list # Flatten the nested list into a flat list
return list(itertools.chain.from_iterable(grouped_md_tuples)) return list(itertools.chain.from_iterable(grouped_md_tuples))
if __name__ == '__main__':
_train = pd.read_csv(directory_path + r'\train_whole.csv')
result = mining(_train)
with open(md_output_dir + r"\mds.pickle", "wb") as file_:
pickle.dump(result, file_)
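Review note: result here is whatever mining() returns; judging from mds_to_txt above it is a list of (md_dict, absolute_support, confidence) tuples, the same object that matching() in ml_er/magellan_er.py later unpickles from mds.pickle. An illustrative entry (keys and values made up) would look like ({'name': 0.82, 'price': -1, 'label': 1.0}, 57, 0.91).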

@@ -11,89 +11,20 @@ from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm import py_entitymatching.catalog.catalog_manager as cm
from tqdm import tqdm from tqdm import tqdm
from colorama import Fore
from md_discovery.md_mining import mining
from settings import * from settings import *
def blocking_mining(): def matching(config: Configuration):
start = time.time() print(Fore.BLUE + f'Config: {config}')
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') with open(md_output_dir + r"\mds.pickle", "rb") as file:
cm.set_key(ltable, ltable_id) md_list = pickle.load(file)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
matching_number = len(mappings)
# if ltable_id == rtable_id:
# tables_id = rtable_id
attributes = ltable.columns.values.tolist()
# lattributes = ['ltable_' + i for i in attributes]
# rattributes = ['rtable_' + i for i in attributes]
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
blocker = em.OverlapBlocker() train_set = pd.read_csv(directory_path + r'\train_whole.csv', encoding='ISO-8859-1')
candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True, test_set = pd.read_csv(directory_path + r'\test_whole.csv', encoding='ISO-8859-1')
l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1, ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1')
overlap_size=1, show_progress=False) rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1')
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
block_time = time.time()
print(f'Block Time: {block_time - start}')
# Label the data according to the mapping table
candidate_match_rows = []
for t in tqdm(mappings.itertuples()):
mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
(candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
matching_indices = candidate[mask].index
candidate_match_rows.extend(matching_indices.tolist())
match_rows_mask = candidate.index.isin(candidate_match_rows)
candidate.loc[match_rows_mask, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# Too many negative samples; sample three times the number of positive samples
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# If the index is not reset after concatenation, duplicate indices may occur
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
cm.set_ltable(candidate_for_train_test, ltable)
cm.set_rtable(candidate_for_train_test, rtable)
block_recall = len(candidate_match) / matching_number
# Split into train and test sets
train_proportion = 0.5
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
label_and_split_time = time.time()
print(f'Label and Split Time: {label_and_split_time - block_time}')
# Mine MDs and save them locally
md_list = mining(train_set)
mining_time = time.time()
print(f'Mining Time: {mining_time - label_and_split_time}')
blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall)
# Save the blocking results locally
with open(er_output_dir + "blocking_result.pickle", "wb") as file_:
pickle.dump(blocking_results, file_)
return blocking_results
def matching(config: Configuration, blocking_result_):
print(f'\033[33mConfig: {config}\033[0m')
start = time.time()
ltable = blocking_result_[0]
rtable = blocking_result_[1]
train_set = blocking_result_[2]
test_set = blocking_result_[3]
md_list = blocking_result_[4]
block_recall = blocking_result_[5]
ml_matcher = config["ml_matcher"] ml_matcher = config["ml_matcher"]
match ml_matcher: match ml_matcher:
case "dt": case "dt":
@@ -109,22 +40,22 @@ def matching(config: Configuration, blocking_result_):
max_features=config['rf_max_features']) max_features=config['rf_max_features'])
cm.set_key(train_set, '_id') cm.set_key(train_set, '_id')
cm.set_fk_ltable(train_set, 'ltable_' + ltable_id) cm.set_fk_ltable(train_set, 'ltable_id')
cm.set_fk_rtable(train_set, 'rtable_' + rtable_id) cm.set_fk_rtable(train_set, 'rtable_id')
cm.set_ltable(train_set, ltable) cm.set_ltable(train_set, ltable)
cm.set_rtable(train_set, rtable) cm.set_rtable(train_set, rtable)
cm.set_key(ltable, ltable_id) cm.set_key(ltable, 'id')
cm.set_key(rtable, rtable_id) cm.set_key(rtable, 'id')
cm.set_key(test_set, '_id') cm.set_key(test_set, '_id')
cm.set_fk_ltable(test_set, 'ltable_' + ltable_id) cm.set_fk_ltable(test_set, 'ltable_id')
cm.set_fk_rtable(test_set, 'rtable_' + rtable_id) cm.set_fk_rtable(test_set, 'rtable_id')
cm.set_ltable(test_set, ltable) cm.set_ltable(test_set, ltable)
cm.set_rtable(test_set, rtable) cm.set_rtable(test_set, rtable)
feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False) feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set, train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table, feature_table=feature_table,
attrs_after=['gold'], attrs_after=['label'],
show_progress=False) show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True) train_feature_vecs.fillna(value=0, inplace=True)
@@ -132,22 +63,21 @@ def matching(config: Configuration, blocking_result_):
for _ in test_feature_after[:]: for _ in test_feature_after[:]:
test_feature_after.append(_.replace('ltable_', 'rtable_')) test_feature_after.append(_.replace('ltable_', 'rtable_'))
for _ in test_feature_after: for _ in test_feature_after:
if _.endswith(ltable_id) or _.endswith(rtable_id): if _.endswith('id'):
test_feature_after.remove(_) test_feature_after.remove(_)
test_feature_after.append('gold') test_feature_after.append('label')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False) attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True) test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold'] fit_exclude = ['_id', 'ltable_id', 'rtable_id', 'label']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='label')
test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id]) test_feature_after.extend(['_id', 'ltable_id', 'rtable_id'])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False) append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted') eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result) em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted') indicators = evaluate_prediction(predictions, 'label', 'predicted')
indicators['block_recall'] = block_recall
test_feature_after.remove('_id') test_feature_after.remove('_id')
test_feature_after.append('predicted') test_feature_after.append('predicted')
@@ -158,31 +88,26 @@ def matching(config: Configuration, blocking_result_):
# Attributes currently in predictions: all left/right table attributes + gold + predicted # Attributes currently in predictions: all left/right table attributes + gold + predicted
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0 predictions['confidence'] = 0
predictions['md'] = ''
epl_match = 0 # explainable predicted matches epl_match = 0 # explainable predicted matches
if len(md_list) > 0: if len(md_list) > 0:
for row in tqdm(predictions.itertuples()): for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict) if str(getattr(row, 'predicted')) == str(1):
if x > 0 and str(getattr(row, 'predicted')) == str(1): conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
predictions.loc[row[0], 'confidence'] = x if conf > 0:
epl_match += 1 predictions.loc[row[0], 'confidence'] = conf
predictions.loc[row[0], 'md'] = str(md_dict)
epl_match += 1
df = predictions[predictions['predicted'] == str(1)] df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability indicators['interpretability'] = interpretability
# note: since the blocking parameters are not tuned, assume block_recall is high enough to ignore
# if indicators["block_recall"] < indicators["recall"]:
# f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
# indicators["precision"] + indicators["block_recall"])
# else:
# f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"] performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
indicators['performance'] = performance indicators['performance'] = performance
print(f'ER Indicators: {indicators}') print(Fore.BLUE + f'ER Indicators: {indicators}')
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True) predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m') print(Fore.CYAN + f'Finish Time: {time.time()}')
return indicators return indicators
@@ -223,7 +148,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
col_tuple_list.append((left_index, right_index)) col_tuple_list.append((left_index, right_index))
length = predictions.shape[0] length = predictions.shape[0]
width = predictions.shape[1] # width = predictions.shape[1]
predictions = predictions.reset_index(drop=True) predictions = predictions.reset_index(drop=True)
sentences = predictions.values.flatten(order='F').tolist() sentences = predictions.values.flatten(order='F').tolist()
@@ -238,7 +163,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
rattr_tensor = norm_table_tensor[col_tuple[1]] rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1) sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4) sim_tensor = torch.round(sim_tensor, decimals=2)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict return sim_tensor_dict
@@ -252,31 +177,28 @@ def is_explicable(row, all_mds: list, st_dict):
explicable = False # If any attribute's similarity falls below its threshold, this MD cannot explain the current tuple pair explicable = False # If any attribute's similarity falls below its threshold, this MD cannot explain the current tuple pair
break # Stop comparing against this MD's remaining thresholds and move on to the next MD break # Stop comparing against this MD's remaining thresholds and move on to the next MD
if explicable: if explicable:
return md_tuple[2] # Return as soon as any MD can explain the pair return md_tuple[2], md_tuple[0] # Return as soon as any MD can explain the pair
return -1.0 # Traversal finished; the pair cannot be explained return -1.0, {} # Traversal finished; the pair cannot be explained
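Review note: is_explicable now returns a (confidence, md_dict) pair instead of a bare confidence. A toy walk-through with made-up attribute names, thresholds and similarities: for md_tuple = ({'name': 0.8, 'price': 0.75, 'title': -1, 'id': 1.0}, 35, 0.9) and a prediction row whose st_dict similarities are name = 0.83, price = 0.80 and title = 0.20, every non-target threshold is met (a -1 threshold is met trivially), so the call returns (0.9, {'name': 0.8, 'price': 0.75, 'title': -1, 'id': 1.0}) and matching() stores that confidence and the MD string on the predicted-match row.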
def ml_er(config: Configuration, blocking_result_): def ml_er(config: Configuration):
indicators = matching(config, blocking_result_) indicators = matching(config)
output_path = er_output_dir + "eval_result.txt" output_path = er_output_dir + r"\eval_result.txt"
with open(output_path, 'w') as _f: with open(output_path, 'w') as _f:
_f.write('Precision:' + str(indicators["precision"]) + '\n') _f.write('Precision:' + str(indicators["precision"]) + '\n')
_f.write('Recall:' + str(indicators["recall"]) + '\n') _f.write('Recall:' + str(indicators["recall"]) + '\n')
_f.write('F1:' + str(indicators["F1"]) + '\n') _f.write('F1:' + str(indicators["F1"]) + '\n')
_f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
_f.write('interpretability:' + str(indicators['interpretability']) + '\n') _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
_f.write('performance:' + str(indicators['performance']) + '\n') _f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__': if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"): if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f: with open(hpo_output_dir + r"\configspace.json", 'r') as f:
dict_configspace = json.load(f) dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace) str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace) configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f: with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
dic = json.load(f) dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic) configuration = ConfigSpace.Configuration(configspace, values=dic)
with open(er_output_dir + "blocking_result.pickle", "rb") as file: ml_er(configuration)
blocking_result = pickle.load(file)
ml_er(configuration, blocking_result)

@@ -1,4 +0,0 @@
from ml_er.magellan_er import blocking_mining
if __name__ == '__main__':
blocking_mining()

@@ -1,363 +0,0 @@
import json
import os
import sys
import time
import ConfigSpace
import pandas
import torch
from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import six
from ConfigSpace import Configuration
from tqdm import tqdm
from md_discovery.md_discover import md_discover
from settings import *
def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
df = train[train['gold'] == 1]
# Make the left and right IDs of each tuple pair identical
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
train_columns = train.columns.values.tolist()
l_columns = []
r_columns = []
cols = []
# Put the left-table and right-table column names into two separate lists
for _ in train_columns:
if _.startswith('ltable'):
l_columns.append(_)
elif _.startswith('rtable'):
r_columns.append(_)
# Strip the prefix from the left-table column names to get a unified column list (assuming corresponding columns in both tables share names)
for _ in l_columns:
cols.append(_.replace('ltable_', ''))
ldf = df[l_columns]
rdf = df[r_columns]
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
t_single_tuple = t_single_tuple.reset_index(drop=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
candidate: pd.DataFrame) -> dict:
new_df = df.reset_index(drop=False, inplace=False)
gold = new_df[labeled_attr]
predicted = new_df[predicted_attr]
gold_negative = gold[gold == 0].index.values
gold_positive = gold[gold == 1].index.values
predicted_negative = predicted[predicted == 0].index.values
predicted_positive = predicted[predicted == 1].index.values
false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
num_true_positives = float(len(true_positive_indices))
num_false_positives = float(len(false_positive_indices))
num_false_negatives = float(len(false_negative_indices))
precision_denominator = num_true_positives + num_false_positives
recall_denominator = num_true_positives + num_false_negatives
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
block_recall = len(candidate[candidate['gold'] == 1]) / matching_number
return {"precision": precision, "recall": recall, "F1": F1, "block_recall": block_recall}
def load_mds(paths: list) -> list:
if len(paths) == 0:
return []
all_mds = []
# A list of MD file paths is passed in
for md_path in paths:
if not os.path.exists(md_path):
continue
mds = []
# Open each MD file
with open(md_path, 'r') as f:
# Read the MD on each line and append it to this file's MD list
for line in f.readlines():
md_metadata = line.strip().split('\t')
# todo: if the MD file format changes, update this as well
md = eval(md_metadata[1])
mds.append(md)
all_mds.extend(mds)
return all_mds
def is_explicable(row, all_mds: list, st_dict):
attrs = all_mds[0][0].keys() # Read all attribute names from the MD dict of the first md_tuple
for md_tuple in all_mds:
explicable = True # Assume this MD can explain the current tuple pair
for a in attrs:
if a != target_attr:
if st_dict[a][row[0]].item() < md_tuple[0][a]:
explicable = False # If any attribute's similarity falls below its threshold, this MD cannot explain the current tuple pair
break # Stop comparing against this MD's remaining thresholds and move on to the next MD
if explicable:
return md_tuple[2] # Return as soon as any MD can explain the pair
return -1.0 # Traversal finished; the pair cannot be explained
# Build a dict: the key is an attribute name, the value is a 1-D tensor holding, for each row of the prediction table, the similarity between that attribute's left and right values
def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = predictions.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# Normalized embedding tensor of the predictions
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
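Review note: because norm_table_tensor is L2-normalized along the embedding dimension, the element-wise product followed by torch.sum over dim 1 equals the cosine similarity cos(l, r) = (l . r) / (|l| * |r|) between the left-attribute and right-attribute embeddings of each row, rounded to four decimals.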
def er_process(config: Configuration):
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
# ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
# rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
# Keep only the rows of both tables that appear in the mapping table, to raise the proportion of positive samples
lid_mapping_list = []
rid_mapping_list = []
# Convert everything to strings
# ltable = ltable.astype(str)
# rtable = rtable.astype(str)
# mappings = mappings.astype(str)
matching_number = len(mappings) # total number of positive samples
for index, row in mappings.iterrows():
lid_mapping_list.append(row[mapping_lid])
rid_mapping_list.append(row[mapping_rid])
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
tables_id = rtable_id
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist() # column names shared by the two tables
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
cm.set_key(selected_ltable, tables_id)
cm.set_key(selected_rtable, tables_id)
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
config["block_attr"], allow_missing=True,
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
overlap_size=1, show_progress=False)
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
candidate_match_rows = []
for row in candidate.itertuples():
l_id = getattr(row, 'ltable_' + tables_id)
map_row = mappings[mappings[mapping_lid] == l_id]
if map_row is not None:
r_id = map_row[mapping_rid]
for value in r_id:
if value == getattr(row, 'rtable_' + tables_id):
candidate_match_rows.append(row[0])
else:
continue
for _ in candidate_match_rows:
candidate.loc[_, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# Downsample the negative samples to keep positives and negatives balanced
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
if len(candidate_mismatch) > len(candidate_match):
candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
# Concatenate the positive and negative samples
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# if len(candidate_for_train_test) == 0:
# return 0
# If the index is not reset after concatenation, duplicate indices may occur
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
cm.set_ltable(candidate_for_train_test, selected_ltable)
cm.set_rtable(candidate_for_train_test, selected_rtable)
block_recall = len(candidate_match) / matching_number
# Split into train and test sets
train_proportion = 0.7
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
# cm.set_key(train_set, '_id')
# cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
# cm.set_ltable(train_set, selected_ltable)
# cm.set_rtable(train_set, selected_rtable)
#
# cm.set_key(test_set, '_id')
# cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
# cm.set_ltable(test_set, selected_ltable)
# cm.set_rtable(test_set, selected_rtable)
ml_matcher = config["ml_matcher"]
if ml_matcher == "dt":
matcher = em.DTMatcher(name='DecisionTree', random_state=0)
elif ml_matcher == "svm":
matcher = em.SVMMatcher(name='SVM', random_state=0)
elif ml_matcher == "rf":
matcher = em.RFMatcher(name='RF', random_state=0)
elif ml_matcher == "lg":
matcher = em.LogRegMatcher(name='LogReg', random_state=0)
elif ml_matcher == "ln":
matcher = em.LinRegMatcher(name='LinReg')
elif ml_matcher == "nb":
matcher = em.NBMatcher(name='NaiveBayes')
feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table,
attrs_after=['gold'],
show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True)
test_feature_after = attrs_with_l_prefix[:]
test_feature_after.extend(attrs_with_r_prefix)
for _ in test_feature_after:
if _.endswith(tables_id):
test_feature_after.remove(_)
test_feature_after.append('gold')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
# Compute interpretability
################################################################################################################
predictions_attrs = []
predictions_attrs.extend(attrs_with_l_prefix)
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
# MDs must be mined from the training set
train_attrs = predictions_attrs[:]
train_attrs.remove('predicted')
train_set = train_set[train_attrs]
prepare_file_for_md_discovery(train_set)
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
md_paths = [md_output_dir + 'mds.txt']
md_list = load_mds(md_paths) # read all the MDs (from the global variable)
epl_match = 0 # explainable predicted matches
unexplainable = pd.DataFrame()
if len(md_list) > 0:
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
epl_match += 1
# else:
# series = pd.Series(row)
# unexplainable = unexplainable._append(series, ignore_index=True)
# unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
# unexplainable.columns = predictions_attrs
# unexplainable = unexplainable[train_attrs]
# if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
# prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
# md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability
if indicators["block_recall"] < indicators["recall"]:
f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
indicators["precision"] + indicators["block_recall"])
else:
f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
indicators['performance'] = performance
indicators['eval_result'] = eval_result
print(indicators)
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
################################################################################################################
print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
return indicators
def ml_er(config: Configuration = None):
indicators = er_process(config)
output_path = er_output_dir + "eval_result.txt"
with open(output_path, 'w') as f:
for key, value in six.iteritems(_get_metric(indicators['eval_result'])):
f.write(key + " : " + value)
f.write('\n')
f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
f.write('interpretability:' + str(indicators['interpretability']) + '\n')
f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
ml_er(configuration)

@@ -1,24 +1,12 @@
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv' directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon'
rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
mapping_lid = 'idAbt' # name of the left-table id column in the mapping table
mapping_rid = 'idBuy' # name of the right-table id column in the mapping table
ltable_block_attr = 'name'
rtable_block_attr = 'name'
ltable_id = 'id' # name of the left-table id column
rtable_id = 'id' # name of the right-table id column
target_attr = 'id' # target attribute for MD mining
# lr_attrs_map = {} # If corresponding columns of the two tables have different names, add them here so they can be aligned
model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2') er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\hpo\output'
model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2')
interpre_weight = 0 # weight of interpretability interpre_weight = 0 # weight of interpretability
similarity_threshold = 0.1
support_threshold = 1 support_threshold = 1
confidence_threshold = 0.75 confidence_threshold = 0.75
er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
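Review note: the new entry points read fixed file names from directory_path (tableA.csv, tableB.csv, train_whole.csv and test_whole.csv). A minimal sanity check (not part of the commit; assumes settings.py is importable):

import os
from settings import directory_path

for name in ('tableA.csv', 'tableB.csv', 'train_whole.csv', 'test_whole.csv'):
    path = os.path.join(directory_path, name)
    assert os.path.isfile(path), f'missing {path}'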
