11.14

2 years ago · 6bdfd9eb51
parent 9a4dfde047
commit 6bdfd9eb51
7 changed files with 71 additions and 272 deletions
--- a/datasets/Amazon-Google/Amzon_GoogleProducts_perfectMapping.csv
+++ b/datasets/Amazon-Google/Amzon_GoogleProducts_perfectMapping.csv
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@ -1,21 +1,12 @@
-import os
-
-import numpy as np
-import torch
 import json
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition
 from ConfigSpace.read_and_write import json as csj
-import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
-
 from smac import HyperparameterOptimizationFacade, Scenario
-
-from md_discovery.md_discover import md_discover
 from settings import *
-from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict, \
-    process_prediction_for_md_discovery, er_process
+from ml_er.ml_entity_resolver import er_process


 class Classifier:
@ -29,17 +20,14 @@ class Classifier:
        block_attr_items.remove(ltable_id)

        block_attr = Categorical("block_attr", block_attr_items)
-        overlap_size = Integer("overlap_size", (1, 3), default=1)
        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
        ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        similarity_thresh = Float("similarity_thresh", (0.2, 0.21))
-        support_thresh = Integer("support_thresh", (1, 1000))
-        confidence_thresh = Float("confidence_thresh", (0.25, 0.5))
+        similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
+        support_thresh = Integer("support_thresh", (1, 5), default=1)
+        confidence_thresh = Float("confidence_thresh", (0.3, 0.7), default=0.4)

-        use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
-        cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker,
-                                similarity_thresh, support_thresh, confidence_thresh])
-        cs.add_conditions([use_overlap_size])
+        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
+                                support_thresh, confidence_thresh])
        return cs

    # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
@ -48,183 +36,6 @@ class Classifier:
        indicators = er_process(config)
        return 1-indicators['performance']

-        # # 数据在外部加载
-        # ########################################################################################################################
-        # ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-        # cm.set_key(ltable, ltable_id)
-        # # ltable.fillna("", inplace=True)
-        # rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-        # cm.set_key(rtable, rtable_id)
-        # # rtable.fillna("", inplace=True)
-        # mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
-        #
-        # lid_mapping_list = []
-        # rid_mapping_list = []
-        # # 全部转为字符串
-        # # ltable = ltable.astype(str)
-        # # rtable = rtable.astype(str)
-        # # mappings = mappings.astype(str)
-        # matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
-        #
-        # for index, row in mappings.iterrows():
-        #     lid_mapping_list.append(row[mapping_lid])
-        #     rid_mapping_list.append(row[mapping_rid])
-        # # 仅保留两表中出现在映射表中的行，增大正样本比例
-        # selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-        # # if len(lr_attrs_map) > 0:
-        # #     selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
-        # tables_id = rtable_id  # 不论左表右表ID字段名是否一致，经上一行调整，统一以右表为准
-        # selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
-        # selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
-        # ########################################################################################################################
-        #
-        # attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # 字段名加左前缀
-        # attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # 字段名加右前缀
-        # cm.set_key(selected_ltable, tables_id)
-        # cm.set_key(selected_rtable, tables_id)
-        #
-        # if config["ml_blocker"] == "over_lap":
-        #     blocker = em.OverlapBlocker()
-        #     candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-        #                                      l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-        #                                      overlap_size=config["overlap_size"], show_progress=False,
-        #                                      allow_missing=True)
-        # elif config["ml_blocker"] == "attr_equiv":
-        #     blocker = em.AttrEquivalenceBlocker()
-        #     candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-        #                                      l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-        #                                      allow_missing=True)
-        #
-        # candidate['gold'] = 0
-        # candidate = candidate.reset_index(drop=True)
-        # candidate_match_rows = []
-        # for line in candidate.itertuples():
-        #     l_id = getattr(line, 'ltable_' + tables_id)
-        #     map_row = mappings[mappings[mapping_lid] == l_id]
-        #
-        #     if map_row is not None:
-        #         r_id = map_row[mapping_rid]
-        #         for value in r_id:
-        #             if value == getattr(line, 'rtable_' + tables_id):
-        #                 candidate_match_rows.append(line[0])
-        #     else:
-        #         continue
-        # for _ in candidate_match_rows:
-        #     candidate.loc[_, 'gold'] = 1
-        #
-        # candidate.fillna("", inplace=True)
-        #
-        # # 裁剪负样本，保持正负样本数量一致
-        # candidate_mismatch = candidate[candidate['gold'] == 0]
-        # candidate_match = candidate[candidate['gold'] == 1]
-        # if len(candidate_mismatch) > len(candidate_match):
-        #     candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
-        # # 拼接正负样本
-        # candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
-        # if len(candidate_for_train_test) == 0:
-        #     return 1
-        # candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
-        # cm.set_key(candidate_for_train_test, '_id')
-        # cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
-        # cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
-        # cm.set_ltable(candidate_for_train_test, selected_ltable)
-        # cm.set_rtable(candidate_for_train_test, selected_rtable)
-        #
-        # # 分为训练测试集
-        # train_proportion = 0.7
-        # test_proportion = 0.3
-        # sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
-        # train_set = sets['train']
-        # test_set = sets['test']
-        #
-        # cm.set_key(train_set, '_id')
-        # cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
-        # cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
-        # cm.set_ltable(train_set, selected_ltable)
-        # cm.set_rtable(train_set, selected_rtable)
-        #
-        # cm.set_key(test_set, '_id')
-        # cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
-        # cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
-        # cm.set_ltable(test_set, selected_ltable)
-        # cm.set_rtable(test_set, selected_rtable)
-        #
-        # if config["ml_matcher"] == "dt":
-        #     matcher = em.DTMatcher(name='DecisionTree', random_state=0)
-        # elif config["ml_matcher"] == "svm":
-        #     matcher = em.SVMMatcher(name='SVM', random_state=0)
-        # elif config["ml_matcher"] == "rf":
-        #     matcher = em.RFMatcher(name='RF', random_state=0)
-        # elif config["ml_matcher"] == "lg":
-        #     matcher = em.LogRegMatcher(name='LogReg', random_state=0)
-        # elif config["ml_matcher"] == "ln":
-        #     matcher = em.LinRegMatcher(name='LinReg')
-        # elif config["ml_matcher"] == "nb":
-        #     matcher = em.NBMatcher(name='NaiveBayes')
-        # feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
-        #
-        # train_feature_vecs = em.extract_feature_vecs(train_set,
-        #                                              feature_table=feature_table,
-        #                                              attrs_after=['gold'],
-        #                                              show_progress=False)
-        # train_feature_vecs.fillna(value=0, inplace=True)
-        # test_feature_after = attrs_with_l_prefix[:]
-        # test_feature_after.extend(attrs_with_r_prefix)
-        # for _ in test_feature_after:
-        #     if _.endswith(tables_id):
-        #         test_feature_after.remove(_)
-        # test_feature_after.append('gold')
-        # test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-        #                                             attrs_after=test_feature_after, show_progress=False)
-        # test_feature_vecs.fillna(value=0, inplace=True)
-        # fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
-        # matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
-        #
-        # test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
-        # predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
-        #                               append=True, target_attr='predicted', inplace=False)
-        # eval_result = em.eval_matches(predictions, 'gold', 'predicted')
-        # em.print_eval_summary(eval_result)
-        # indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
-        # print(indicators)
-        #
-        # # 计算可解释性
-        # predictions_attrs = []
-        # predictions_attrs.extend(attrs_with_l_prefix)
-        # predictions_attrs.extend(attrs_with_r_prefix)
-        # predictions_attrs.extend(['gold', 'predicted'])
-        # predictions = predictions[predictions_attrs]
-        # process_prediction_for_md_discovery(predictions)
-        # predictions = predictions.reset_index(drop=True)
-        # predictions = predictions.astype(str)
-        # sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
-        #
-        # #  默认路径为 "../md_discovery/output/xxx.txt"
-        # #  mds/vio  共2个md文件
-        # md_discover(config)
-        # md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
-        # md_list = load_mds(md_paths)  # 从全局变量中读取所有的md
-        # epl_match = 0  # 可解释，预测match
-        # if len(md_list) > 0:
-        #     for line in predictions.itertuples():
-        #         if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1):
-        #             epl_match += 1
-        #
-        # ppre = predictions[predictions['predicted'] == str(1)]
-        # interpretability = epl_match / len(ppre)  # 可解释性
-        #
-        # if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]):
-        #     f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
-        #                 indicators["precision"] + indicators["block_recall"])
-        # else:
-        #     f1 = indicators["F1"]
-        # # if indicators["block_recall"] < 0.8:
-        # #     return 1
-        # # f1 = indicators["F1"]
-        # performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
-        # print('Interpretability: ', interpretability)
-        # return 1 - performance
-

 def ml_er_hpo():
    classifier = Classifier()
@ -234,7 +45,6 @@ def ml_er_hpo():
    with open(hpo_output_dir + "configspace.json", "w") as f:
        json.dump(dict_configspace, f, indent=4)

-    # Next, we create an object, holding general information about the run
    scenario = Scenario(
        cs,
        deterministic=True,
@ -244,7 +54,6 @@ def ml_er_hpo():

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

-    # Now we use SMAC to find the best hyperparameters
    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
--- a/md_discovery/discovery_executor.py
+++ b/md_discovery/discovery_executor.py
@ -1,16 +1,9 @@
-import multiprocessing
-import time
-from concurrent.futures import ProcessPoolExecutor
-from multiprocessing.managers import SharedMemoryManager
-
 import numpy as np
 import pandas as pd
 import copy
-
 import torch
 from ConfigSpace import Configuration
 from tqdm import tqdm
-
 from settings import model


@ -97,59 +90,58 @@ def pairs_inference(path, target_col, conf: Configuration):
                    md_list.remove(md)
                    violated_mds.append(md)

-            for vio_md in violated_mds:
-                # 特殊化左侧
-                for col in cols_but_target:
-                    if sims[col] + 0.01 <= 1:
-                        spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
-                        if is_minimal(spec_l_md, md_list, target_col):
-                            md_list.append(spec_l_md)
-                if vio_md not in minimal_vio:
-                    minimal_vio.append(vio_md)
            # for vio_md in violated_mds:
            #     # 特殊化左侧
            #     for col in cols_but_target:
-            #         if sims[col] < 1:
-            #             spec_l_md = copy.deepcopy(vio_md)
-            #             if sims[col] < threshold:
-            #                 spec_l_md[col] = threshold
-            #             else:
            #         if sims[col] + 0.01 <= 1:
-            #                     spec_l_md[col] = sims[col] + 0.01
-            #                 else:
-            #                     spec_l_md[col] = 1
+            #             spec_l_md = copy.deepcopy(vio_md)
+            #             spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
            #             if is_minimal(spec_l_md, md_list, target_col):
            #                 md_list.append(spec_l_md)
            #     if vio_md not in minimal_vio:
            #         minimal_vio.append(vio_md)
+
+            for vio_md in violated_mds:
+                vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
+                if vio_md_support >= supt:
+                    for col in cols_but_target:
+                        if sims[col] < 1.0:
+                            spec_l_md = copy.deepcopy(vio_md)
+                            if sims[col] < simt:
+                                spec_l_md[col] = simt
+                            else:
+                                if sims[col] + 0.01 <= 1.0:
+                                    spec_l_md[col] = sims[col] + 0.01
+                                else:
+                                    spec_l_md[col] = 1.0
+                            if is_minimal(spec_l_md, md_list, target_col):
+                                md_list.append(spec_l_md)
+                    if vio_md not in minimal_vio:
+                        minimal_vio.append(vio_md)
+
            if len(md_list) == 0:
                terminate = True
                break
        if terminate:
            break

+    if len(minimal_vio) > 0:
+        remove_list = []
+        for md in minimal_vio:
+            support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+            if confidence < cont:
+                remove_list.append(md)
+        for _ in remove_list:
+            minimal_vio.remove(_)
+
+    if len(md_list) > 0:
+        # 去除重复MD
        tmp = []
        for _ in md_list:
            if _ not in tmp:
                tmp.append(_)
        md_list = tmp
-    if len(md_list) > 0:
-        for vio in minimal_vio[:]:
-            if not is_minimal(vio, md_list, target_col):
-                minimal_vio.remove(vio)
-        for _ in md_list[:]:
-            if not is_minimal(_, md_list, target_col):
-                md_list.remove(_)
-
-    print('mds_list\t', len(md_list), '\n')
-    print('vio_list\t', len(minimal_vio), '\n')
-
-    if len(minimal_vio) == 0:
-        return md_list, []
-
-    remove_list = []
-    if len(md_list) > 0:
+        # 去除support小于阈值MD
        md_rm_list = []
        for _ in md_list:
            support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
@ -157,23 +149,23 @@ def pairs_inference(path, target_col, conf: Configuration):
                md_rm_list.append(_)
        for _ in md_rm_list:
            md_list.remove(_)
-    for md in minimal_vio:
-        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-        # fuck.append((support, confidence))
-        if support < supt:
-            remove_list.append(md)
-        if confidence < cont and md not in remove_list:
-            remove_list.append(md)
-    # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
-    for _ in remove_list:
-        minimal_vio.remove(_)
+        # 去除不minimal的MD
+        for _ in md_list[:]:
+            if not is_minimal(_, md_list, target_col):
+                md_list.remove(_)
+        if len(minimal_vio) > 0:
+            for vio in minimal_vio[:]:
+                if not is_minimal(vio, md_list, target_col):
+                    minimal_vio.remove(vio)

+    if len(minimal_vio) > 0:
        for _ in minimal_vio[:]:
            if not is_minimal(_, minimal_vio, target_col):
                minimal_vio.remove(_)

-    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
-    print(f"Support: {supt}\tConfidence: {cont}")
+    print(f'\033[33mList Length: {len(md_list)}\033[0m')
+    print(f'\033[33mVio Length: {len(minimal_vio)}\033[0m')
+    print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')

    return md_list, minimal_vio

@ -196,11 +188,11 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
            sup_tensor_slice = sup_tensor[i]
            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
    sup_tensor_int = ini_slice.int()
-    support = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = torch.count_nonzero(sup_tensor_int).item()

    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
    conf_tensor_int = ini_slice.int()
-    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
-    confidence = confidence_numerator / support
+    support_Fan = torch.count_nonzero(conf_tensor_int).item()
+    confidence = support_Fan / support_Naumann

-    return support, confidence
+    return support_Fan, confidence
--- a/md_discovery/md_discover.py
+++ b/md_discovery/md_discover.py
@ -1,6 +1,6 @@
 from ConfigSpace import Configuration

-from md_discovery import tmp_discover
+from md_discovery.discovery_executor import pairs_inference
 from settings import *

 # # 若不输出support和confidence，使用以下两块代码
@ -21,7 +21,7 @@ def md_discover(config: Configuration):
    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
    # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
    # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值
-    mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, target_attr, config)
+    mds_list, vio_list = pairs_inference(t_single_tuple_path, target_attr, config)

    # 将列表1写入本地，路径需自己修改
    mds_path = md_output_dir + "mds.txt"
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@ -184,7 +184,7 @@ def er_process(config: Configuration):
        candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
                                         config["block_attr"], allow_missing=True,
                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                         overlap_size=config["overlap_size"], show_progress=False)
+                                         overlap_size=1, show_progress=False)
    elif config["ml_blocker"] == "attr_equiv":
        blocker = em.AttrEquivalenceBlocker()
        candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
@ -336,7 +336,6 @@ def ml_er(config: Configuration = None):
        f.write('performance:' + str(indicators['performance']) + '\n')


-
 if __name__ == '__main__':
    if os.path.isfile(hpo_output_dir + "incumbent.json"):
        with open(hpo_output_dir + "configspace.json", 'r') as f:
--- a/settings.py
+++ b/settings.py
@ -1,11 +1,10 @@
 from sentence_transformers import SentenceTransformer
-import numpy as np

-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
-mapping_lid = 'idAbt'  # mapping表中左表id名
-mapping_rid = 'idBuy'  # mapping表中右表id名
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\matches.csv'
+mapping_lid = 'ltable_id'  # mapping表中左表id名
+mapping_rid = 'rtable_id'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
--- a/tfile.py
+++ b/tfile.py
@ -9,7 +9,7 @@ import pandas as pd
 import torch
 from tqdm import tqdm
 from ConfigSpace.read_and_write import json as csj
-from md_discovery import tmp_discover
+from md_discovery import discovery_executor
 from settings import er_output_dir, hpo_output_dir