MD-metrics-HPO
HuangJintao 1 year ago
parent 7ef6b87cb8
commit ff52072867

@ -63,7 +63,6 @@ class Classifier:
# train 就是整个函数 只需将返回结果由预测变成预测结果的评估 # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
def train(self, config: Configuration, seed: int = 0) -> float: def train(self, config: Configuration, seed: int = 0) -> float:
# print(f"BRAINFUCK:{config.values()}")
cm.del_catalog() cm.del_catalog()
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀 attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀 attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀
@ -206,6 +205,7 @@ class Classifier:
# return 1 # return 1
# f1 = indicators["F1"] # f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
print('Interpretability: ', interpretability)
return 1 - performance return 1 - performance
@ -221,7 +221,7 @@ def ml_er_hpo():
scenario = Scenario( scenario = Scenario(
cs, cs,
deterministic=True, deterministic=True,
n_trials=10, # We want to run max 50 trials (combination of config and seed) n_trials=12, # We want to run max 50 trials (combination of config and seed)
n_workers=1 n_workers=1
) )

@ -16,7 +16,6 @@ from settings import *
def md_discover(): def md_discover():
# 目前可以仿照这个main函数写
t_single_tuple_path = er_output_dir + "t_single_tuple.csv" t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
# 输入csv文件路径md左侧相似度阈值md右侧目标字段 # 输入csv文件路径md左侧相似度阈值md右侧目标字段
# 输出2个md列表列表1中md无violation,列表2中md有violation但confidence满足阈值 # 输出2个md列表列表1中md无violation,列表2中md有violation但confidence满足阈值

@ -59,13 +59,13 @@ def pairs_inference(path, threshold, target_col):
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
torch.save(sim_tensor, md_output_dir + "tensor.pt") # torch.save(sim_tensor, md_output_dir + "tensor.pt")
md_list = [] md_list = []
minimal_vio = [] minimal_vio = []
init_md = {} init_md = {}
for col in columns: for col in columns:
init_md[col] = 1 if col == target_col else 0 init_md[col] = 1 if col == target_col else -1
md_list.append(init_md) md_list.append(init_md)
for row1 in range(0, length - 1): for row1 in range(0, length - 1):
@ -145,7 +145,6 @@ def pairs_inference(path, threshold, target_col):
return md_list, [] return md_list, []
remove_list = [] remove_list = []
# fuck = []
if len(md_list) > 0: if len(md_list) > 0:
md_rm_list = [] md_rm_list = []
for _ in md_list: for _ in md_list:

@ -1,20 +1,23 @@
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
import numpy as np import numpy as np
ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableA.csv' ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableB.csv' rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\matches.csv' mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
mapping_lid = 'ltable_id' # mapping表中左表id名 mapping_lid = 'idAbt' # mapping表中左表id名
mapping_rid = 'rtable_id' # mapping表中右表id名 mapping_rid = 'idBuy' # mapping表中右表id名
ltable_id = 'id' # 左表id字段名称 ltable_id = 'id' # 左表id字段名称
rtable_id = 'id' # 右表id字段名称 rtable_id = 'id' # 右表id字段名称
target_attr = 'id' # 进行md挖掘时的目标字段 target_attr = 'id' # 进行md挖掘时的目标字段
# lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 # lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致
similarity_threshold = 0.16
support_threshold = 70 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
confidence_threshold = 0.3 similarity_threshold = 0.2
interpre_weight = 1 # 可解释性权重 support_threshold = 100
confidence_threshold = 0.4
interpre_weight = 0.5 # 可解释性权重
er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\' er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\' md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\' hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')

Loading…
Cancel
Save