11.2

2 years ago · 24985da169
parent 882c25d20f
commit 24985da169
18 changed files with 97856 additions and 139 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,4 @@
 /deprecated/
 /datasets/
 /ml_er/output/*
 /md_discovery/output/*
 /hpo/output/*
--- a/datasets/Abt-Buy/matches.csv
+++ b/datasets/Abt-Buy/matches.csv
--- a/datasets/Abt-Buy/tableA.csv
+++ b/datasets/Abt-Buy/tableA.csv
--- a/datasets/Abt-Buy/tableB.csv
+++ b/datasets/Abt-Buy/tableB.csv
--- a/datasets/Amazon-Google/Amazon.csv
+++ b/datasets/Amazon-Google/Amazon.csv
--- a/datasets/Amazon-Google/Amzon_GoogleProducts_perfectMapping.csv
+++ b/datasets/Amazon-Google/Amzon_GoogleProducts_perfectMapping.csv
--- a/datasets/Amazon-Google/GoogleProducts.csv
+++ b/datasets/Amazon-Google/GoogleProducts.csv
--- a/datasets/Walmart-Amazon_dirty/matches.csv
+++ b/datasets/Walmart-Amazon_dirty/matches.csv
--- a/datasets/Walmart-Amazon_dirty/tableA.csv
+++ b/datasets/Walmart-Amazon_dirty/tableA.csv
--- a/datasets/Walmart-Amazon_dirty/tableB.csv
+++ b/datasets/Walmart-Amazon_dirty/tableB.csv
--- a/datasets/iTunes-Amazon_dirty/tableA.csv
+++ b/datasets/iTunes-Amazon_dirty/tableA.csv
--- a/datasets/iTunes-Amazon_dirty/tableB.csv
+++ b/datasets/iTunes-Amazon_dirty/tableB.csv
--- a/entrance.py
+++ b/entrance.py
@ -1,27 +0,0 @@
 # this is the entrance of the auto-ER procedure
 from md_discovery.md_discover import md_discover
 from ml_er.ml_entity_resolver import ml_er
 from hpo.er_model_hpo import ml_er_hpo
 from settings import *
 def run(rounds: int):
    """Drive the auto-ER loop for ``rounds`` optimisation rounds.

    Each round calls ``ml_er`` with the current round number and the
    current hyper-parameter configuration, then ``md_discover``, then
    ``ml_er_hpo`` to obtain the configuration for the next round.  After
    the last round ``ml_er`` is called once more with the final
    configuration.  The first call receives ``None`` as configuration.

    :param rounds: number of ml_er -> md_discover -> ml_er_hpo rounds.
    """
    hp_config = None
    # Stays 0 when the loop body never executes, so the closing call is
    # numbered 1 in that case.
    last_round = 0
    for last_round in range(1, rounds + 1):
        ml_er(last_round, hp_config)
        md_discover()
        hp_config = ml_er_hpo()
    # Closing pass with the configuration produced by the last round.
    ml_er(last_round + 1, hp_config)
 if __name__ == '__main__':
    # Script entry point: run the auto-ER procedure, then print the
    # configured left-table path.
    # todo
    #  Use drop to remove columns from the feature vectors?
    #  (e.g. remove the id-related features)
    run(1)  # one round is requested here (the original note said "iterate 3 rounds")
    # ml_er(1)
    # todo Write the optimisation results and the parameters to a file:
    #  emit them through ml_entity_resolver.ml_er(), together with the
    #  parameter configuration information
    print(ltable_path)
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@ -17,17 +17,17 @@ from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicabl
 # 数据在外部加载
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-ltable.fillna("", inplace=True)
+# ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-rtable.fillna("", inplace=True)
+# rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)
 lid_mapping_list = []
 rid_mapping_list = []
 # 全部转为字符串
-ltable = ltable.astype(str)
+# ltable = ltable.astype(str)
-rtable = rtable.astype(str)
+# rtable = rtable.astype(str)
-mappings = mappings.astype(str)
+# mappings = mappings.astype(str)
 matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
 for index, row in mappings.iterrows():
@ -35,7 +35,8 @@ for index, row in mappings.iterrows():
    rid_mapping_list.append(row[mapping_rid])
 # 仅保留两表中出现在映射表中的行，增大正样本比例
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+if len(lr_attrs_map) > 0:
    selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
 tables_id = rtable_id  # 不论左表右表ID字段名是否一致，经上一行调整，统一以右表为准
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
@ -72,11 +73,13 @@ class Classifier:
            blocker = em.OverlapBlocker()
            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                             overlap_size=config["overlap_size"], show_progress=False)
+                                             overlap_size=config["overlap_size"], show_progress=False,
                                             allow_missing=True)
        elif config["ml_blocker"] == "attr_equiv":
            blocker = em.AttrEquivalenceBlocker()
            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
                                             allow_missing=True)
        candidate['gold'] = 0
@ -159,6 +162,8 @@ class Classifier:
                                                    attrs_after=test_feature_after, show_progress=False)
        fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
        train_feature_vecs.fillna(0, inplace=True)
        test_feature_vecs.fillna(0, inplace=True)
        matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
        test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
@ -176,7 +181,7 @@ class Classifier:
        predictions_attrs.extend(['gold', 'predicted'])
        predictions = predictions[predictions_attrs]
        predictions = predictions.reset_index(drop=True)
-        predictions = predictions.astype(str)
+        # predictions = predictions.astype(str)
        sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
        #  默认路径为 "../md_discovery/output/xxx.txt"
@ -191,14 +196,14 @@ class Classifier:
        ppre = predictions[predictions['predicted'] == str(1)]
        interpretability = epl_match / len(ppre)  # 可解释性
-        # todo block_recall可以考虑以下注释
+
-        # if indicators["block_recall"] >= 0.8:
+        if indicators["block_recall"] >= 0.8:
-        #     f1 = indicators["F1"]
+            f1 = indicators["F1"]
-        # else:
+        else:
-        #     f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
+            f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
-        if indicators["block_recall"] < 0.8:
+        # if indicators["block_recall"] < 0.8:
-            return 1
+        #     return 1
-        f1 = indicators["F1"]
+        # f1 = indicators["F1"]
        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
        return 1 - performance
@ -230,17 +235,20 @@ def ml_er_hpo():
    )
    incumbent = smac.optimize()
-    incumbent_ndarray = incumbent.get_array()
+    incumbent_cost = smac.validate(incumbent)
-    np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)
+    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")
-    # Get cost of default configuration
+    if incumbent_cost > default_cost:
-    default_cost = smac.validate(cs.get_default_configuration())
+        incumbent = default
-    print(f"Default cost: {default_cost}")
+        print(f"Updated Incumbent Cost: {default_cost}")
-    # Let's calculate the cost of the incumbent
+    print(f"Optimized Configuration:{incumbent.values()}")
-    incumbent_cost = smac.validate(incumbent)
+
-    print(f"Incumbent cost: {incumbent_cost}")
+    incumbent_ndarray = incumbent.get_array()
-    print(f"Optimized_Configuration:{incumbent.values()}")
+    np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)
    return incumbent
--- a/md_discovery/tmp_discover.py
+++ b/md_discovery/tmp_discover.py
@ -10,9 +10,7 @@ import copy
 import torch
 from tqdm import tqdm
-from settings import model, md_output_dir
+from settings import model, md_output_dir, confidence_threshold
 conf_thresh = 0.8
 def is_minimal(md, md_list, target_col):
@ -60,7 +58,6 @@ def pairs_inference(path, threshold, target_col):
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    sim_tensor = sim_tensor/2 + 0.5
    torch.save(sim_tensor, md_output_dir + "tensor.pt")
@ -130,7 +127,7 @@ def pairs_inference(path, threshold, target_col):
        if support < 1:
            print('delete by support')
            remove_list.append(md)
-        if confidence < 0.5:
+        if confidence < confidence_threshold:
            print('delete by confidence')
            remove_list.append(md)
    # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
@ -172,54 +169,3 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    confidence = confidence_numerator / support
    return support, confidence
 def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
    """Compute support and confidence for every MD in ``md_list``.

    The dataset at ``dataset_path`` is loaded as an all-string table
    (missing values become empty strings) and each MD is evaluated by
    ``get_one_md_metadata`` in a worker process.

    :param md_list: MDs to evaluate; each is passed unchanged to
        ``get_one_md_metadata``.
    :param dataset_path: path of the CSV file (read as ISO-8859-1).
    :param sim_tensor: similarity tensor forwarded to the workers.
    :param target_col: name of the target column forwarded to the workers.
    :return: list of ``{"md", "support", "confidence"}`` dicts, in the
        same order as ``md_list``; ``[]`` when ``md_list`` is empty.
    """
    # Nothing to evaluate: return before loading the CSV or starting any
    # helper process.
    if len(md_list) == 0:
        return []
    data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    # Never start more workers than there are MDs to evaluate.
    pool_size = min(16, len(md_list))
    # The context manager tears the pool down even if a task raises; all
    # results are collected inside the block, before it exits.
    with multiprocessing.Pool(pool_size) as pool:
        # Submit every task before waiting on any of them: calling get()
        # right after apply_async() would run the tasks one at a time.
        pending = [
            (md, pool.apply_async(get_one_md_metadata, args=(md, data, sim_tensor, target_col)))
            for md in md_list
        ]
        result = []
        for md, task in pending:
            support, confidence = task.get()
            result.append({"md": md, "support": support, "confidence": confidence})
    return result
 def get_one_md_metadata(md, dataframe, sim_tensor, target_col):
    """Count support and confidence of one MD over all unordered row pairs.

    For every pair of distinct rows the similarity of each column is read
    from ``sim_tensor[column_index, row1, row2]``.  A pair supports the MD
    when no non-target column has a similarity below ``md[column]``.  It
    additionally counts towards the confidence numerator when the target
    column's similarity is not below 1.

    :param md: mapping from non-target column name to similarity threshold.
    :param dataframe: table whose column order and row count define the
        indices used to address ``sim_tensor``.
    :param sim_tensor: indexable as ``[column, row, row]``; each element
        must provide ``.item()``.
    :param target_col: name of the target (right-hand side) column.
    :return: ``(support, confidence)``; confidence is 0 when support is 0.
    """
    col_names = dataframe.columns.values.tolist()
    n_rows = dataframe.shape[0]
    support = 0
    satisfied = 0
    for first in range(n_rows - 1):
        for second in range(first + 1, n_rows):
            lhs_ok = True
            target_ok = True
            # Every column is inspected (no early exit), matching the
            # lookups performed for each pair.
            for position, name in enumerate(col_names):
                score = sim_tensor[position, first, second].item()
                if name == target_col:
                    if score < 1:
                        target_ok = False
                elif score < md[name]:
                    lhs_ok = False
            if lhs_ok:
                support += 1
                if target_ok:
                    satisfied += 1
    if support == 0:
        return support, 0
    return support, satisfied / support
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@ -46,7 +46,7 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
    rdf.columns = cols
    t_single_tuple = pd.concat([ldf, rdf])
-    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True)
+    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@ -140,45 +140,42 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
        rattr_tensor = norm_table_tensor[col_tuple[1]]
        mul_tensor = lattr_tensor * rattr_tensor
        sim_tensor = torch.sum(mul_tensor, 1)
        sim_tensor = sim_tensor / 2 + 0.5
        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
    return sim_tensor_dict
 def ml_er(iter_round: int, config: Configuration = None, ):
    # todo:
    #  if config is not None -> load configs
    #  else -> use default configs
    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
    cm.set_key(ltable, ltable_id)
-    ltable.fillna("", inplace=True)
+    # ltable.fillna("", inplace=True)
    rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
    cm.set_key(rtable, rtable_id)
-    rtable.fillna("", inplace=True)
+    # rtable.fillna("", inplace=True)
-    mappings = pd.read_csv(mapping_path)
+    mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
    # 仅保留两表中出现在映射表中的行，增大正样本比例
    lid_mapping_list = []
    rid_mapping_list = []
    # 全部转为字符串
-    ltable = ltable.astype(str)
+    # ltable = ltable.astype(str)
-    rtable = rtable.astype(str)
+    # rtable = rtable.astype(str)
-    mappings = mappings.astype(str)
+    # mappings = mappings.astype(str)
-    matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
+    matching_number = len(mappings)  # 所有阳性样本数
    for index, row in mappings.iterrows():
        lid_mapping_list.append(row[mapping_lid])
        rid_mapping_list.append(row[mapping_rid])
    selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-    selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+    if len(lr_attrs_map) > 0:
        selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
    tables_id = rtable_id
    selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
    selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
    items_but_id = selected_attrs[:]
    items_but_id.remove(tables_id)  # 两张表中除了id的字段名
-    attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
+    attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
-    attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
+    attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
    cm.set_key(selected_ltable, tables_id)
    cm.set_key(selected_rtable, tables_id)
@ -199,19 +196,21 @@ def ml_er(iter_round: int, config: Configuration = None, ):
        if config["ml_blocker"] == "over_lap":
            blocker = em.OverlapBlocker()
-            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
                                             config["block_attr"], allow_missing=True,
                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
                                             overlap_size=config["overlap_size"], show_progress=False)
        elif config["ml_blocker"] == "attr_equiv":
            blocker = em.AttrEquivalenceBlocker()
-            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
                                             config["block_attr"], allow_missing=True,
                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
    else:
        matcher = em.RFMatcher(name='RF', random_state=0)
        blocker = em.OverlapBlocker()
        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                         overlap_size=1, show_progress=False)
+                                         overlap_size=1, show_progress=False, allow_missing=True)
    candidate['gold'] = 0
@ -267,10 +266,12 @@ def ml_er(iter_round: int, config: Configuration = None, ):
                                                attrs_after=test_feature_after, show_progress=False)
    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
    train_feature_vecs.fillna(0, inplace=True)
    test_feature_vecs.fillna(0, inplace=True)
    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
-                             append=True, target_attr='predicted', inplace=False)
+                                  append=True, target_attr='predicted', inplace=False)
    eval_result = em.eval_matches(predictions, 'gold', 'predicted')
    em.print_eval_summary(eval_result)
    indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, test_proportion)
@ -301,7 +302,8 @@ def ml_er(iter_round: int, config: Configuration = None, ):
    if indicators["block_recall"] >= 0.8:
        f1 = indicators["F1"]
    else:
-        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
+        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
                    indicators["precision"] + indicators["block_recall"])
    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
    ################################################################################################################
--- a/settings.py
+++ b/settings.py
@ -1,20 +1,20 @@
 from sentence_transformers import SentenceTransformer
 import numpy as np
-ltable_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\Amazon.csv'
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv'
-rtable_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\GoogleProducts.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv'
-mapping_path = 'E:\\Data\\Research\\Projects\\matching_dependency\\datasets\\Amzon_GoogleProducts_perfectMapping.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv'
-mapping_lid = 'idAmazon'  # mapping表中左表id名
+mapping_lid = 'id1'  # mapping表中左表id名
-mapping_rid = 'idGoogleBase'  # mapping表中右表id名
+mapping_rid = 'id2'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
-lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
+lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-similarity_threshold = 0.7
+similarity_threshold = 0.2
 support_threshold = 1
-confidence_threshold = 0.8
+confidence_threshold = 0.5
 interpre_weight = 0.3  # 可解释性权重
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
 hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
-model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2')
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
--- a/tfile.py
+++ b/tfile.py
@ -116,6 +116,13 @@ def test8():
    print(cum)
 def test9():
    """Convert ./datasets/t.json (JSON lines) to ./datasets/s.csv and re-read it.

    The CSV is read back only for the side effect of parsing it; the
    resulting frame is discarded.  Prints ``1`` when done.
    """
    json_path = r'./datasets/t.json'
    csv_path = r'./datasets/s.csv'
    pd.read_json(json_path, encoding='ISO-8859-1', lines=True).to_csv(csv_path)
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    print(1)
 if __name__ == '__main__':
    start = time.time()
    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"