From ff520728675347ec8f8691d2e1e865ee8d48c47a Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Mon, 13 Nov 2023 10:34:19 +0800 Subject: [PATCH] 11.13 --- hpo/er_model_hpo.py | 4 ++-- md_discovery/md_discover.py | 1 - md_discovery/tmp_discover.py | 5 ++--- settings.py | 23 +++++++++++++---------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py index 16c7d4c..486bfc0 100644 --- a/hpo/er_model_hpo.py +++ b/hpo/er_model_hpo.py @@ -63,7 +63,6 @@ class Classifier: # train 就是整个函数 只需将返回结果由预测变成预测结果的评估 def train(self, config: Configuration, seed: int = 0) -> float: - # print(f"BRAINFUCK:{config.values()}") cm.del_catalog() attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀 attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀 @@ -206,6 +205,7 @@ class Classifier: # return 1 # f1 = indicators["F1"] performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 + print('Interpretability: ', interpretability) return 1 - performance @@ -221,7 +221,7 @@ def ml_er_hpo(): scenario = Scenario( cs, deterministic=True, - n_trials=10, # We want to run max 50 trials (combination of config and seed) + n_trials=12, # We want to run max 50 trials (combination of config and seed) n_workers=1 ) diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py index 876513a..ae5d50b 100644 --- a/md_discovery/md_discover.py +++ b/md_discovery/md_discover.py @@ -16,7 +16,6 @@ from settings import * def md_discover(): - # 目前可以仿照这个main函数写 t_single_tuple_path = er_output_dir + "t_single_tuple.csv" # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值 diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py index a9f58b6..d4d7870 100644 --- a/md_discovery/tmp_discover.py +++ b/md_discovery/tmp_discover.py @@ -59,13 +59,13 @@ def pairs_inference(path, threshold, target_col): norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) - torch.save(sim_tensor, md_output_dir + "tensor.pt") + # torch.save(sim_tensor, md_output_dir + "tensor.pt") md_list = [] minimal_vio = [] init_md = {} for col in columns: - init_md[col] = 1 if col == target_col else 0 + init_md[col] = 1 if col == target_col else -1 md_list.append(init_md) for row1 in range(0, length - 1): @@ -145,7 +145,6 @@ def pairs_inference(path, threshold, target_col): return md_list, [] remove_list = [] - # fuck = [] if len(md_list) > 0: md_rm_list = [] for _ in md_list: diff --git a/settings.py b/settings.py index c257342..9537ded 100644 --- a/settings.py +++ b/settings.py @@ -1,20 +1,23 @@ from sentence_transformers import SentenceTransformer import numpy as np -ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableA.csv' -rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableB.csv' -mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\matches.csv' -mapping_lid = 'ltable_id' # mapping表中左表id名 -mapping_rid = 'rtable_id' # mapping表中右表id名 +ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv' +rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv' +mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv' +mapping_lid = 'idAbt' # mapping表中左表id名 +mapping_rid = 'idBuy' # mapping表中右表id名 ltable_id = 'id' # 左表id字段名称 rtable_id = 'id' # 右表id字段名称 target_attr = 'id' # 进行md挖掘时的目标字段 # lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 -similarity_threshold = 0.16 -support_threshold = 70 -confidence_threshold = 0.3 -interpre_weight = 1 # 可解释性权重 + +model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens') +similarity_threshold = 0.2 +support_threshold = 100 +confidence_threshold = 0.4 +interpre_weight = 0.5 # 可解释性权重 + er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\' md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\' hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\' -model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens') +