11.13

2 years ago · ff52072867
parent 7ef6b87cb8
commit ff52072867
4 changed files with 17 additions and 16 deletions
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -63,7 +63,6 @@ class Classifier:

    # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
    def train(self, config: Configuration, seed: int = 0) -> float:
-        # print(f"BRAINFUCK:{config.values()}")
        cm.del_catalog()
        attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # 字段名加左前缀
        attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # 字段名加右前缀
@@ -206,6 +205,7 @@ class Classifier:
        #     return 1
        # f1 = indicators["F1"]
        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
+        print('Interpretability: ', interpretability)
        return 1 - performance


@@ -221,7 +221,7 @@ def ml_er_hpo():
    scenario = Scenario(
        cs,
        deterministic=True,
-        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
+        n_trials=12,  # We want to run max 12 trials (combination of config and seed)
        n_workers=1
    )

--- a/md_discovery/md_discover.py
+++ b/md_discovery/md_discover.py
@@ -16,7 +16,6 @@ from settings import *


 def md_discover():
-    # 目前可以仿照这个main函数写
    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
    # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
    # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值
--- a/md_discovery/tmp_discover.py
+++ b/md_discovery/tmp_discover.py
@@ -59,13 +59,13 @@ def pairs_inference(path, threshold, target_col):
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))

-    torch.save(sim_tensor, md_output_dir + "tensor.pt")
+    # torch.save(sim_tensor, md_output_dir + "tensor.pt")

    md_list = []
    minimal_vio = []
    init_md = {}
    for col in columns:
-        init_md[col] = 1 if col == target_col else 0
+        init_md[col] = 1 if col == target_col else -1
    md_list.append(init_md)

    for row1 in range(0, length - 1):
@@ -145,7 +145,6 @@ def pairs_inference(path, threshold, target_col):
        return md_list, []

    remove_list = []
-    # fuck = []
    if len(md_list) > 0:
        md_rm_list = []
        for _ in md_list:
--- a/settings.py
+++ b/settings.py
@@ -1,20 +1,23 @@
 from sentence_transformers import SentenceTransformer
 import numpy as np

-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\matches.csv'
-mapping_lid = 'ltable_id'  # mapping表中左表id名
-mapping_rid = 'rtable_id'  # mapping表中右表id名
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+mapping_lid = 'idAbt'  # mapping表中左表id名
+mapping_rid = 'idBuy'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
 # lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-similarity_threshold = 0.16
-support_threshold = 70
-confidence_threshold = 0.3
-interpre_weight = 1  # 可解释性权重
+
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+similarity_threshold = 0.2
+support_threshold = 100
+confidence_threshold = 0.4
+interpre_weight = 0.5  # 可解释性权重
+
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
 hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
-model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+