From 253eb0835f645e8defee56d2728e10cc3a9a2c0a Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Sat, 3 Feb 2024 00:36:00 +0800
Subject: [PATCH] fuck

---
 .gitignore                  |   4 +
 README.md                   |   2 -
 draw/__init__.py            |   0
 hpo/__init__.py             |   0
 hpo/er_model_hpo.py         |  93 +++++++++
 md_discovery/__init__.py    |   0
 md_discovery/md_discover.py | 143 ++++++++++++++
 ml_er/__init__.py           |   0
 ml_er/ml_entity_resolver.py | 364 ++++++++++++++++++++++++++++++++++++
 ml_er/new.py                | 160 ++++++++++++++++
 settings.py                 |  22 +++
 11 files changed, 786 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 README.md
 create mode 100644 draw/__init__.py
 create mode 100644 hpo/__init__.py
 create mode 100644 hpo/er_model_hpo.py
 create mode 100644 md_discovery/__init__.py
 create mode 100644 md_discovery/md_discover.py
 create mode 100644 ml_er/__init__.py
 create mode 100644 ml_er/ml_entity_resolver.py
 create mode 100644 ml_er/new.py
 create mode 100644 settings.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..241c07c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/ml_er/output/*
+/md_discovery/output/*
+/hpo/output/*
+/datasets/*
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index 9cba231..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# matching_dependency_pyJedAI
-
diff --git a/draw/__init__.py b/draw/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hpo/__init__.py b/hpo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
new file mode 100644
index 0000000..13ee960
--- /dev/null
+++ b/hpo/er_model_hpo.py
@@ -0,0 +1,93 @@
+import pandas as pd
+import json
+from time import *
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+from ConfigSpace.read_and_write import json as csj
+from smac import HyperparameterOptimizationFacade, Scenario
+from settings import *
+from ml_er.ml_entity_resolver import er_process
+from settings import ltable_path, ltable_id
+
+
+class Classifier:
+    @property
+    def configspace(self) -> ConfigurationSpace:
+        cs = ConfigurationSpace(seed=0)
+        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
+        selected_attrs = ltable.columns.values.tolist()
+        block_attr_items = selected_attrs[:]
+        block_attr_items.remove(ltable_id)
+
+        jed_blocker = Categorical("jed_blocker",
+                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
+        block_attr = Categorical("block_attr", block_attr_items)
+        # filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)
+        meta_blocker = Categorical("meta_blocker",
+                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
+        weighting_scheme = Categorical("weighting_scheme",
+                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
+                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])
+
+        # todo other hyperparameters
+        matching_metric = Categorical("matching_metric",
+                                      ['cosine', 'euclidean'])
+        matching_tokenizer = Categorical("matching_tokenizer",
+                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
+        matching_vectorizer = Categorical("matching_vectorizer",
+                                          ['tfidf', 'tf', 'boolean'])
+        clusteror = Categorical("clusteror_name",
+                                ["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
+
+        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
+                                matching_tokenizer, matching_vectorizer, clusteror])
+        return cs
+
+    def train(self, config: Configuration, seed: int = 0) -> float:
+        indicators = er_process(config)
+        return 1 - indicators['performance']
+
+
+def ml_er_hpo():
+    classifier = Classifier()
+    cs = classifier.configspace
+    str_configspace = csj.write(cs)
+    dict_configspace = json.loads(str_configspace)
+    with open(hpo_output_dir + r"\configspace.json", "w") as f:
+        json.dump(dict_configspace, f, indent=4)
+
+    scenario = Scenario(
+        cs,
+        deterministic=True,
+        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
+        n_workers=1
+    )
+
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+
+    smac = HyperparameterOptimizationFacade(
+        scenario,
+        classifier.train,
+        initial_design=initial_design,
+        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
+    )
+
+    incumbent = smac.optimize()
+    incumbent_cost = smac.validate(incumbent)
+    default = cs.get_default_configuration()
+    default_cost = smac.validate(default)
+    print(f"Default Cost: {default_cost}")
+    print(f"Incumbent Cost: {incumbent_cost}")
+
+    if incumbent_cost > default_cost:
+        incumbent = default
+        print(f"Updated Incumbent Cost: {default_cost}")
+
+    print(f"Optimized Configuration:{incumbent.values()}")
+
+    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
+        json.dump(dict(incumbent), f, indent=4)
+    return incumbent
+
+
+if __name__ == '__main__':
+    ml_er_hpo()
diff --git a/md_discovery/__init__.py b/md_discovery/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py
new file mode 100644
index 0000000..ee3fe69
--- /dev/null
+++ b/md_discovery/md_discover.py
@@ -0,0 +1,143 @@
+import math
+import operator
+
+import numpy as np
+import pandas as pd
+import torch
+from ConfigSpace import Configuration
+from settings import *
+import random
+from tqdm import tqdm
+
+
+sample_number = 100000
+step_length = 0.01
+
+
+def md_discover(config: Configuration, source_path, target_path):
+    mds_list = discover(source_path, target_attr)
+    if len(mds_list) > 0:
+        with open(target_path, 'w') as f:
+            for _ in mds_list:
+                f.write('Target:'+str(target_attr) + '\t')
+                f.write(str(_))
+                f.write('\n')
+
+
+def discover(path, target_col):
+    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    data.fillna("", inplace=True)
+    data = data.astype(str)
+    columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
+    cols_but_target = list(set(columns) - {target_col})
+    length = data.shape[0]
+    width = data.shape[1]
+    # indices of every column except the target column
+    columns_indices = [_ for _ in range(0, width) if _ != target_index]
+
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = data.values[row, col]
+            sentences.append(cell_value)
+    if len(sentences) == 0:
+        return []
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
+    sim_tensor = sim_tensor.float()
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    # with fewer than 6 columns a full Cartesian product is feasible; with more it may blow up exponentially
+    if width < 6:
+        # enumerate every candidate threshold of each non-target column and take the Cartesian product, i.e. all possible MD threshold combinations
+        cartesian = build_cartesian(width, target_index)
+        # sample sample_number / (width - 1) MDs (no -1 entries)
+        if cartesian.shape[0] > sample_number / (width - 1):
+            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
+            cartesian = torch.index_select(cartesian, 0, index)
+    else:
+        # randomly generate sample_number / (width - 1) MDs (no -1 entries); use randint on integers first, then divide to obtain decimals
+        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
+                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
+        cartesian = cartesian / 100
+        # build a column of similarity 1 for the target attribute and insert it at the target index
+        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
+        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
+    cartesian = torch.round(cartesian, decimals=4)
+    # this tensor will be concatenated with the variants whose columns are set to -1
+    joint_md_tensor = cartesian.clone()
+    # randomly set 1 column, 2 columns, ... to -1
+    for i in range(width - 2):
+        index_list_format = []
+        for j in range(cartesian.shape[0]):
+            # for each MD, randomly pick the column indices that will be set to -1
+            index_list_format.append(random.sample(columns_indices, i + 1))
+        index = torch.tensor(index_list_format, device='cuda')
+        # the MD set after the chosen columns have been set to -1
+        modified_cartesian = cartesian.scatter(1, index, -1)
+        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
+
+    md_list = []
+    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
+    for _ in tqdm(range(joint_md_tensor.shape[0])):
+        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
+        if s >= support_threshold and c >= confidence_threshold:
+            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
+            md_dict_format = {}
+            for k in range(0, width):
+                md_dict_format[columns[k]] = md_list_format[k]
+            md_list.append((md_dict_format, s, c))
+    md_list.sort(key=operator.itemgetter(2), reverse=True)
+    return md_list
+
+def get_metrics(md_tensor, data, sim_tensor, target_index):
+    length = data.shape[0]
+    width = data.shape[1]
+
+    # md_tensor = list(current_md.values())
+    # md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+
+    sim_tensor = torch.round(sim_tensor, decimals=4)
+
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = (support_Naumann - length) / 2
+
+    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = conf_tensor.int()
+    support_Fan = torch.count_nonzero(conf_tensor_int).item()
+    support_Fan = (support_Fan - length) / 2
+    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
+
+    return support_Fan, confidence
+
+
+def build_cartesian(width, target_index):
+    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
+                                   num=math.ceil((1-similarity_threshold)/step_length) + 1)
+    all_values_array = np.round(all_values_array, 4)
+    all_values_tensor = torch.tensor(all_values_array, device='cuda')
+    all_values_tensor = all_values_tensor.float()
+    all_values_tensor = torch.round(all_values_tensor, decimals=4)
+    tensors_for_cartesian = []
+    for i in range(0, width):
+        if i == target_index:
+            t = torch.tensor([1.0], device='cuda')
+            tensors_for_cartesian.append(t)
+        else:
+            tensors_for_cartesian.append(all_values_tensor)
+    result = torch.cartesian_prod(*tensors_for_cartesian)
+    return result
diff --git a/ml_er/__init__.py b/ml_er/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
new file mode 100644
index 0000000..a109e22
--- /dev/null
+++ b/ml_er/ml_entity_resolver.py
@@ -0,0 +1,364 @@
+import json
+import os
+from time import time
+
+import ConfigSpace
+import pandas as pd
+import torch
+from ConfigSpace import Configuration
+from ConfigSpace.read_and_write import json as csj
+from pyjedai.clustering import UniqueMappingClustering
+from tqdm import tqdm
+
+from md_discovery.md_discover import md_discover
+from settings import *
+from pyjedai.datamodel import Data
+from pyjedai.block_cleaning import BlockPurging
+from pyjedai.block_cleaning import BlockFiltering
+from pyjedai.matching import EntityMatching
+from pyjedai.block_building import (
+    StandardBlocking,
+    QGramsBlocking,
+    ExtendedQGramsBlocking,
+    SuffixArraysBlocking,
+    ExtendedSuffixArraysBlocking,
+)
+from pyjedai.comparison_cleaning import (
+    WeightedEdgePruning,
+    WeightedNodePruning,
+    CardinalityEdgePruning,
+    CardinalityNodePruning,
+    BLAST,
+    ReciprocalCardinalityNodePruning,
+    ReciprocalWeightedNodePruning,
+    ComparisonPropagation
+)
+from pyjedai.clustering import *
+
+
+def prepare_file_for_md_discovery(train: pd.DataFrame, t_single_tuple_path=er_output_dir + r"\t_single_tuple.csv"):
+    # keep only the tuple pairs that actually match
+    df = train[train['gold'] == '1']
+    # copy the left-table id onto the right-table id column
+    for index, row in df.iterrows():
+        df.loc[index, "rtable_" + rtable_id] = row["ltable_" + ltable_id]
+
+    train_columns = train.columns.values.tolist()
+    l_columns = []
+    r_columns = []
+    cols = []
+    # collect the left-table and right-table column names into separate lists
+    for _ in train_columns:
+        if _.startswith('ltable'):
+            l_columns.append(_)
+        elif _.startswith('rtable'):
+            r_columns.append(_)
+    # strip the prefix from the left-table column names to get a unified column list (assumes matching columns share names across the two tables)
+    for _ in l_columns:
+        cols.append(_.replace('ltable_', ''))
+
+    ldf = df[l_columns]
+    rdf = df[r_columns]
+    ldf.columns = cols
+    rdf.columns = cols
+    t_single_tuple = pd.concat([ldf, rdf])
+    t_single_tuple = t_single_tuple.reset_index(drop=True)
+
+    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
+
+
+# build a dict: key = attribute name, value = 1-D tensor with the similarity between the left and right values of that attribute for every row of the prediction table
+def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
+    predictions_attrs = predictions.columns.values.tolist()
+    col_tuple_list = []
+    for _ in predictions_attrs:
+        if _.startswith('ltable'):
+            left_index = predictions_attrs.index(_)
+            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
+            col_tuple_list.append((left_index, right_index))
+
+    df = predictions.drop(columns=['gold', 'predicted', 'confidence'], inplace=False)
+    length = df.shape[0]
+    width = df.shape[1]
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = df.values[row, col]
+            sentences.append(cell_value)
+    if len(sentences) == 0:
+        return {}
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    # normalized embedding tensor of the prediction table
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor_dict = {}
+    for col_tuple in col_tuple_list:
+        lattr_tensor = norm_table_tensor[col_tuple[0]]
+        rattr_tensor = norm_table_tensor[col_tuple[1]]
+        mul_tensor = lattr_tensor * rattr_tensor
+        sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = torch.round(sim_tensor, decimals=4)
+        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
+    return sim_tensor_dict
+
+
+def load_mds(paths: list) -> list:
+    if len(paths) == 0:
+        return []
+    all_mds = []
+    # iterate over the given list of MD file paths
+    for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
+        mds = []
+        # open each MD file
+        with open(md_path, 'r') as f_:
+            # read the MD on each line and append it to this file's MD list
+            for line in f_.readlines():
+                md_metadata = line.strip().split('\t')
+                # todo: update this if the MD file format changes
+                md = eval(md_metadata[1])
+                mds.append(md)
+        all_mds.extend(mds)
+    return all_mds
+
+
+def is_explicable(row, all_mds: list, st_dict):
+    attrs = all_mds[0][0].keys()  # read all attribute names from the MD dict of the first md_tuple
+    for md_tuple in all_mds:
+        explicable = True  # assume this MD can explain the current tuple pair
+        for a in attrs:
+            if a != target_attr:
+                if st_dict[a][row[0]].item() < md_tuple[0][a]:
+                    explicable = False  # if any attribute's similarity is below the threshold, this MD cannot explain the pair
+                    break  # skip this MD's remaining thresholds and move on to the next MD
+        if explicable:
+            return md_tuple[2]  # return as soon as any MD explains the pair
+    return -1.0  # no MD explains the pair
+
+
+def er_process(config: Configuration):
+    print(f'\033[33mConfig: {config}\033[0m')
+    start = time()
+    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
+    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
+    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
+    data = Data(dataset_1=ltable,
+                id_column_name_1=ltable_id,
+                dataset_2=rtable,
+                id_column_name_2=rtable_id,
+                ground_truth=mapping)
+
+    # clean data(optional)
+    data.clean_dataset(remove_stopwords=False,
+                       remove_punctuation=False,
+                       remove_numbers=False,
+                       remove_unicodes=False)
+
+    # block building
+    blocker_name = config["jed_blocker"]
+    block_attribute = config["block_attr"]
+    match blocker_name:
+        case "Standard":
+            blocker = StandardBlocking()
+        case "QGrams":
+            blocker = QGramsBlocking()
+        case "ExtendedQG":
+            blocker = ExtendedQGramsBlocking()
+        case "SuffixArrays":
+            blocker = SuffixArraysBlocking()
+        case "ExtendedSA":
+            blocker = ExtendedSuffixArraysBlocking()
+    blocks = blocker.build_blocks(data, attributes_1=[block_attribute], attributes_2=[block_attribute])
+
+    # block purging(optional)
+    bp = BlockPurging()
+    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
+
+    # block cleaning(optional)
+    bf = BlockFiltering(ratio=0.8)  # todo what is ratio for?
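+    # note: the ratio appears to control how many of each entity's smallest blocks BlockFiltering retains
+    # (0.8 keeps roughly the smallest 80%); lower values prune candidate comparisons more aggressively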
+    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
+
+    # Comparison Cleaning - Meta Blocking(optional)
+    pruning_method = config["meta_blocker"]
+    weighting_scheme_name = config["weighting_scheme"]
+    match pruning_method:
+        case "WEP":
+            meta_blocker = WeightedEdgePruning(weighting_scheme=weighting_scheme_name)
+        case "WNP":
+            meta_blocker = WeightedNodePruning(weighting_scheme=weighting_scheme_name)
+        case "CEP":
+            meta_blocker = CardinalityEdgePruning(weighting_scheme=weighting_scheme_name)
+        case "CNP":
+            meta_blocker = CardinalityNodePruning(weighting_scheme=weighting_scheme_name)
+        case "BLAST":
+            meta_blocker = BLAST(weighting_scheme=weighting_scheme_name)
+        case "RCNP":
+            meta_blocker = ReciprocalCardinalityNodePruning(weighting_scheme=weighting_scheme_name)
+        case "RWNP":
+            meta_blocker = ReciprocalWeightedNodePruning(weighting_scheme=weighting_scheme_name)
+        case "CP":
+            meta_blocker = ComparisonPropagation()
+    candidate_pairs_blocks = meta_blocker.process(blocks=filtered_blocks, data=data, tqdm_disable=True)
+
+    # entity matching
+    em = EntityMatching(
+        metric=config["matching_metric"],
+        tokenizer=config["matching_tokenizer"],
+        vectorizer=config["matching_vectorizer"],
+        qgram=3,
+        similarity_threshold=0.0
+    )
+
+    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
+    # draw(pairs_graph)
+
+    # entity clustering
+    clusteror_name = config["clusteror_name"]
+    match clusteror_name:
+        case "CCC":
+            clusteror = ConnectedComponentsClustering()
+        case "UMC":
+            clusteror = UniqueMappingClustering()
+        case "EC":
+            clusteror = ExactClustering()
+        case "CenterC":
+            clusteror = CenterClustering()
+        case "BMC":
+            clusteror = BestMatchClustering()
+        case "MCC":
+            clusteror = MergeCenterClustering()
+        case "CC":
+            clusteror = CorrelationClustering()
+        case "CTC":
+            clusteror = CutClustering()
+        case "MCL":
+            clusteror = MarkovClustering()
+        case "KMAC":
+            clusteror = KiralyMSMApproximateClustering()
+        case "RSRC":
+            clusteror = RicochetSRClustering()
+    # obtain the predicted clusters and evaluation metrics
+    clusters = clusteror.process(pairs_graph, data, similarity_threshold=0.17)
+    matches_dataframe = clusteror.export_to_df(clusters)
+    matches_dataframe_path = er_output_dir + r'\matches_dataframe.csv'
+    matches_dataframe.to_csv(matches_dataframe_path, sep=',', index=False, header=True, quoting=1)
+    evaluation = clusteror.evaluate(clusters)
+    # evaluation.pop('True Positives')
+    # evaluation.pop('False Positives')
+    # evaluation.pop('True Negatives')
+    # evaluation.pop('False Negatives')
+
+
+    ltable = ltable.astype(str)
+    rtable = rtable.astype(str)
+    mapping = mapping.astype(str)
+    result_df = matches_dataframe.astype(str)
+    lcolumns_dict = {}
+    rcolumns_dict = {}
+    ltable_attrs = ltable.columns.values.tolist()
+    rtable_attrs = rtable.columns.values.tolist()
+    for _ in ltable_attrs:
+        lcolumns_dict[_] = 'ltable_' + _
+    for _ in rtable_attrs:
+        rcolumns_dict[_] = 'rtable_' + _
+    result_lid_list = result_df['id1'].tolist()
+    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
+    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
+    selected_ltable['key'] = 1
+    result_rid_list = result_df['id2'].tolist()
+    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
+    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
+    selected_rtable['key'] = 1
+    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
+    predictions.drop(columns='key', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+
+    predictions['gold'] = '0'
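+    # both flag columns default to '0'; the loop below sets 'gold' to '1' for pairs present in the
+    # ground-truth mapping and 'predicted' to '1' for pairs returned by the clustering step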
+    predictions['predicted'] = '0'
+    gold_match_rows = []
+    predicted_match_rows = []
+    for tuple_ in tqdm(predictions.itertuples()):
+        lid = getattr(tuple_, 'ltable_' + ltable_id)
+        map_row = mapping[mapping[mapping_lid] == lid]
+        result_row = result_df[result_df['id1'] == lid]
+        if not map_row.empty:
+            rid = map_row[mapping_rid]
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    gold_match_rows.append(tuple_[0])
+        if not result_row.empty:
+            rid = result_row['id2']
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    predicted_match_rows.append(tuple_[0])
+    for _ in gold_match_rows:
+        predictions.loc[_, 'gold'] = '1'
+    for _ in predicted_match_rows:
+        predictions.loc[_, 'predicted'] = '1'
+
+    # mine MDs and compute interpretability
+    prepare_file_for_md_discovery(predictions)
+    predictions['confidence'] = 0
+    predicted_match = predictions[predictions['predicted'] == '1']
+    predicted_match = predicted_match.reset_index(drop=True)
+    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predicted_match)
+
+    md_discover(config, er_output_dir + r"\t_single_tuple.csv", md_output_dir + r"\mds.txt")
+    md_paths = [md_output_dir + r'\mds.txt']
+    md_list = load_mds(md_paths)  # load all mined MDs from the output file
+    epl_match = 0  # count of explainable predicted matches
+
+    if sim_tensor_dict:
+        if len(md_list) > 0:
+            for row in tqdm(predicted_match.itertuples()):
+                x = is_explicable(row, md_list, sim_tensor_dict)
+                if x > 0:
+                    predicted_match.loc[row[0], 'confidence'] = x
+                    epl_match += 1
+        interpretability = epl_match / len(predicted_match)  # interpretability score
+        evaluation['interpretability'] = interpretability
+    else:
+        interpretability = 0
+        evaluation['interpretability'] = interpretability
+    f1 = evaluation['F1 %'] / 100
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
+    evaluation['performance'] = performance
+    print(f'\033[33mEvaluation: {evaluation}\033[0m')
+    predicted_match.to_csv(er_output_dir + r'\predicted_match.csv', sep=',', index=False, header=True)
+    ################################################################################################################
+    print(f'\033[33mTime consumed by ML-ER in seconds: {time() - start}\033[0m')
+
+    return evaluation
+
+
+def ml_er(config: Configuration = None):
+    indicators = er_process(config)
+    output_path = er_output_dir + r"\eval_result.txt"
+    with open(output_path, 'w') as f_:
+        for key, value in indicators.items():
+            f_.write(key + " : " + str(value))
+            f_.write('\n')
+
+
+# if __name__ == '__main__':
+#     if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
+#         with open(hpo_output_dir + r"\configspace.json", 'r') as f:
+#             dict_configspace = json.load(f)
+#         str_configspace = json.dumps(dict_configspace)
+#         configspace = csj.read(str_configspace)
+#         with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
+#             dic = json.load(f)
+#         configuration = ConfigSpace.Configuration(configspace, values=dic)
+#         ml_er(configuration)
+
+if __name__ == '__main__':
+    with open(hpo_output_dir + r"\configspace.json", 'r') as f:
+        dict_configspace = json.load(f)
+    str_configspace = json.dumps(dict_configspace)
+    configspace = csj.read(str_configspace)
+    with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
+        dic = json.load(f)
+    configuration = ConfigSpace.Configuration(configspace, values=dic)
+    ml_er(configuration)
diff --git a/ml_er/new.py b/ml_er/new.py
new file mode 100644
index 0000000..17e4d89
--- /dev/null
+++ b/ml_er/new.py
@@ -0,0 +1,160 @@
+import os
+import sys
+import pandas as pd
+import networkx
+from networkx import draw, Graph
+import pyjedai
+from pyjedai.utils import (
+    text_cleaning_method,
+    print_clusters,
+    print_blocks,
+    print_candidate_pairs
+)
+from pyjedai.block_building import (
+    StandardBlocking,
+    QGramsBlocking,
+    ExtendedQGramsBlocking,
+    SuffixArraysBlocking,
+    ExtendedSuffixArraysBlocking,
+)
+from pyjedai.comparison_cleaning import (
+    WeightedEdgePruning,
+    WeightedNodePruning,
+    CardinalityEdgePruning,
+    CardinalityNodePruning,
+    BLAST,
+    ReciprocalCardinalityNodePruning,
+    ReciprocalWeightedNodePruning,
+    ComparisonPropagation
+)
+from pyjedai.evaluation import Evaluation
+from pyjedai.datamodel import Data
+from pyjedai.block_cleaning import BlockPurging
+from pyjedai.block_cleaning import BlockFiltering
+from pyjedai.matching import EntityMatching
+from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
+
+from settings import er_output_dir, ltable_path, rtable_path, ltable_id, rtable_id, mapping_lid, mapping_rid, \
+    mapping_path
+
+
+def example():
+    # read data
+    d1 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv",
+                     sep='|', engine='python', na_filter=False)
+    d2 = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv",
+                     sep='|', engine='python', na_filter=False)
+    gt = pd.read_csv(r"E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv",
+                     sep='|', engine='python')
+
+    data = Data(dataset_1=d1,
+                id_column_name_1='id',
+                dataset_2=d2,
+                id_column_name_2='id',
+                ground_truth=gt)
+
+    # clean data(optional)
+    data.clean_dataset(remove_stopwords=False,
+                       remove_punctuation=False,
+                       remove_numbers=False,
+                       remove_unicodes=False)
+
+    # block building
+    bb = StandardBlocking()
+    blocks = bb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])
+
+    # block purging(optional)
+    bp = BlockPurging()
+    cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
+
+    # block cleaning(optional)
+    # todo ratio
+    bf = BlockFiltering(ratio=0.8)
+    filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
+
+    # Comparison Cleaning - Meta Blocking(optional)
+    # todo meta_blocking methods, weighting_scheme (more)
+    mb = WeightedEdgePruning(weighting_scheme='EJS')
+    candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=True)
+
+    # entity matching
+    # todo parameters(qgram, similarity_threshold)
+    em = EntityMatching(
+        metric='cosine',
+        tokenizer='char_tokenizer',
+        vectorizer='tfidf',
+        qgram=3,
+        similarity_threshold=0.0
+    )
+
+    # visualize the undirected weighted similarity graph
+    pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
+    draw(pairs_graph)
+
+    # entity clustering
+    # todo similarity_threshold
+    ccc = UniqueMappingClustering()
+    clusters = ccc.process(pairs_graph, data, similarity_threshold=0.17)
+    result_df = ccc.export_to_df(clusters)
+    p = er_output_dir + r'\result.csv'
+    result_df.to_csv(p, sep=',', index=False, header=True, quoting=1)
+    _ = ccc.evaluate(clusters)
+    return result_df
+
+
+if __name__ == '__main__':
+    rdf = example()
+    rdf = rdf.astype(str)
+    ltable = pd.read_csv(ltable_path, sep='|', engine='python', na_filter=False)
+    rtable = pd.read_csv(rtable_path, sep='|', engine='python', na_filter=False)
+    mapping = pd.read_csv(mapping_path, sep='|', engine='python')
+    ltable = ltable.astype(str)
+    rtable = rtable.astype(str)
+    mapping = mapping.astype(str)
+
+    lcolumns_dict = {}
+    rcolumns_dict = {}
+    ltable_attrs = ltable.columns.values.tolist()
+    rtable_attrs = rtable.columns.values.tolist()
+    for _ in ltable_attrs:
+        lcolumns_dict[_] = 'ltable_' + _
+    for _ in rtable_attrs:
+        rcolumns_dict[_] = 'rtable_' + _
+    result_lid_list = rdf['id1'].tolist()
+    selected_ltable = ltable[ltable[ltable_id].isin(result_lid_list)]
+    selected_ltable = selected_ltable.rename(columns=lcolumns_dict)
+    selected_ltable['key'] = 1
+    result_rid_list = rdf['id2'].tolist()
+    selected_rtable = rtable[rtable[rtable_id].isin(result_rid_list)]
+    selected_rtable = selected_rtable.rename(columns=rcolumns_dict)
+    selected_rtable['key'] = 1
+    predictions = pd.merge(selected_ltable, selected_rtable, on='key')
+    predictions.drop(columns='key', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+
+    predictions['gold'] = '0'
+    predictions['predicted'] = '0'
+    gold_match_rows = []
+    predicted_match_rows = []
+    for tuple_ in predictions.itertuples():
+        lid = getattr(tuple_, 'ltable_' + ltable_id)
+        map_row = mapping[mapping[mapping_lid] == lid]
+        result_row = rdf[rdf['id1'] == lid]
+        if not map_row.empty:
+            rid = map_row[mapping_rid]
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    gold_match_rows.append(tuple_[0])
+        if not result_row.empty:
+            rid = result_row['id2']
+            for value in rid:
+                if value == getattr(tuple_, 'rtable_' + rtable_id):
+                    predicted_match_rows.append(tuple_[0])
+    for _ in gold_match_rows:
+        predictions.loc[_, 'gold'] = '1'
+    for _ in predicted_match_rows:
+        predictions.loc[_, 'predicted'] = '1'
+
+    predictions['confidence'] = 0
+    predicted_match = predictions[predictions['predicted'] == '1']
+    print(1)
diff --git a/settings.py b/settings.py
new file mode 100644
index 0000000..1535028
--- /dev/null
+++ b/settings.py
@@ -0,0 +1,22 @@
+from sentence_transformers import SentenceTransformer
+
+ltable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\abt.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\buy.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\datasets\JedAI\ccer\D2\gt.csv'
+mapping_lid = 'D1'  # name of the left-table id column in the mapping table
+mapping_rid = 'D2'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # id column name of the left table
+rtable_id = 'id'  # id column name of the right table
+target_attr = 'id'  # target attribute for MD discovery
+# lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, record them here so they can be aligned
+
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+interpre_weight = 0.5  # weight of interpretability in the combined objective
+similarity_threshold = 0.1
+support_threshold = 1
+confidence_threshold = 0.25
+
+er_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\ml_er\output'
+md_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\md_discovery\output'
+hpo_output_dir = r'E:\Data\Research\Projects\matching_dependency_pyJedAI\hpo\output'
+
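
Usage sketch (illustrative only, not part of the committed files; it assumes the repository root is on PYTHONPATH, the dataset paths in settings.py exist, and a CUDA device is available for the embedding model):

    import json
    import ConfigSpace
    from ConfigSpace.read_and_write import json as csj

    from hpo.er_model_hpo import ml_er_hpo
    from ml_er.ml_entity_resolver import ml_er
    from settings import hpo_output_dir

    # run the SMAC search; this writes configspace.json and incumbent.json into hpo/output
    ml_er_hpo()

    # reload the saved configuration space and incumbent, then run ER once with that configuration
    with open(hpo_output_dir + r"\configspace.json", 'r') as f:
        configspace = csj.read(json.dumps(json.load(f)))
    with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
        incumbent = ConfigSpace.Configuration(configspace, values=json.load(f))
    ml_er(incumbent)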