1. All similarity computations now run on the GPU
2. The number of iteration rounds is detected automatically
3. Hyperparameter-optimization results are persisted to disk
3.超参数优化结果落盘
HuangJintao 1 year ago
parent 20c33c0fd8
commit 882c25d20f
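For reference, a minimal sketch of the GPU cosine-similarity pattern this commit standardizes on (encode cell values with the SentenceTransformer, normalize, multiply element-wise, sum, and rescale into [0, 1]), mirroring build_col_pairs_sim_tensor_dict added below; the model name and example strings are illustrative (settings.py loads the same model from a local path):

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
left = model.encode(["Apple iPhone 12"], convert_to_tensor=True, device="cuda")
right = model.encode(["iPhone 12 by Apple"], convert_to_tensor=True, device="cuda")
left = torch.nn.functional.normalize(left, dim=1)
right = torch.nn.functional.normalize(right, dim=1)
sim = torch.sum(left * right, dim=1) / 2 + 0.5  # cosine similarity rescaled into [0, 1]
print(sim.item())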

.gitignore (vendored, 3 changed lines)

@ -1,2 +1,5 @@
/deprecated/
/datasets/
/ml_er/output/*
/md_discovery/output/*
/hpo/output/*

@ -1,14 +1,18 @@
import os
import numpy as np
import torch
import json
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
from ConfigSpace.conditions import InCondition
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from settings import *
from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict
# Data is loaded externally
########################################################################################################################
@ -171,23 +175,23 @@ class Classifier:
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
# The default path is "../md_discovery/output/xxx.txt"
# TP/FN mds/vio: four MD files in total
md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt',
md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt']
epl_match = 0 # explainable predictions labeled as match
nepl_mismatch = 0 # unexplainable predictions labeled as mismatch
# mds/vio: two MD files in total
md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
md_list = load_mds(md_paths) # load all MDs from the MD output files
epl_match = 0 # explainable predictions labeled as match
if len(md_list) > 0:
for line in predictions.itertuples():
if is_explicable(line, md_list):
if getattr(line, 'predicted') == 1:
epl_match += 1
else:
if getattr(line, 'predicted') == 0:
nepl_mismatch += 1
interpretability = (epl_match + nepl_mismatch) / len(predictions) # interpretability
if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1):
epl_match += 1
ppre = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(ppre) # interpretability
# TODO: for block_recall, consider the commented-out logic below
# if indicators["block_recall"] >= 0.8:
# f1 = indicators["F1"]
# else:
@ -201,10 +205,15 @@ class Classifier:
def ml_er_hpo():
classifier = Classifier()
cs = classifier.configspace
str_configspace = csj.write(cs)
dict_configspace = json.loads(str_configspace)
with open(hpo_output_dir + "configspace.json", "w") as f:
json.dump(dict_configspace, f)
# Next, we create an object, holding general information about the run
scenario = Scenario(
classifier.configspace,
cs,
deterministic=True,
n_trials=10, # run at most 10 trials (combinations of config and seed)
n_workers=1
@ -221,9 +230,11 @@ def ml_er_hpo():
)
incumbent = smac.optimize()
incumbent_ndarray = incumbent.get_array()
np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)
# Get cost of default configuration
default_cost = smac.validate(classifier.configspace.get_default_configuration())
default_cost = smac.validate(cs.get_default_configuration())
print(f"Default cost: {default_cost}")
# Let's calculate the cost of the incumbent
@ -235,4 +246,4 @@ def ml_er_hpo():
if __name__ == '__main__':
print(1)
ml_er_hpo()
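For reference, a minimal sketch of how a later run rebuilds the tuned Configuration from the two artifacts persisted above (configspace.json and incumbent.npy); it mirrors the loading code added to ml_entity_resolver further down and assumes hpo_output_dir from settings.py:

import json
import numpy as np
import ConfigSpace
from ConfigSpace.read_and_write import json as csj
from settings import hpo_output_dir

incumbent_array = np.load(hpo_output_dir + 'incumbent.npy')
with open(hpo_output_dir + "configspace.json", 'r') as f:
    str_configspace = json.dumps(json.load(f))
configspace = csj.read(str_configspace)
configuration = ConfigSpace.Configuration(configspace, vector=incumbent_array)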

@ -1,5 +1,4 @@
from md_discovery.multi_process_infer_by_pairs import inference_from_record_pairs
from md_discovery.multi_process_infer_by_pairs import get_mds_metadata
from md_discovery import tmp_discover
from settings import *
# # If support and confidence are not needed in the output, use the following two code blocks
@ -18,51 +17,25 @@ from settings import *
def md_discover():
# For now, new discovery code can follow this main function
tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
fn_single_tuple_path = er_output_dir + "fn_single_tuple.csv"
t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
# Input: CSV file path, similarity threshold for the MD left-hand side, target attribute for the right-hand side
# Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
# For example, these inputs require every left-hand-side similarity to be at least 0.7 and the right-hand side to point to the 'id' attribute
tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
fn_mds, fn_vio = inference_from_record_pairs(fn_single_tuple_path, similarity_threshold, target_attr)
# Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold
mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr)
# Remove the following two lines if support and confidence are not needed in the output
tp_mds_meta = get_mds_metadata(tp_mds, tp_single_tuple_path, target_attr)
tp_vio_meta = get_mds_metadata(tp_vio, tp_single_tuple_path, target_attr)
fn_mds_meta = get_mds_metadata(fn_mds, fn_single_tuple_path, target_attr)
fn_vio_meta = get_mds_metadata(fn_vio, fn_single_tuple_path, target_attr)
# Use the following two blocks to output support and confidence
# Write list 1 to a local path (adjust the path as needed)
tp_mds_path = md_output_dir + "tp_mds.txt"
tp_vio_path = md_output_dir + "tp_vio.txt"
mds_path = md_output_dir + "mds.txt"
vio_path = md_output_dir + "vio.txt"
with open(tp_mds_path, 'w') as f:
for _ in tp_mds_meta:
for i in _.keys():
f.write(i + ':' + str(_[i]) + '\t')
f.write('\n')
with open(tp_vio_path, 'w') as f:
for _ in tp_vio_meta:
for i in _.keys():
f.write(i + ':' + str(_[i]) + '\t')
f.write('\n')
fn_mds_path = md_output_dir + "fn_mds.txt"
fn_vio_path = md_output_dir + "fn_vio.txt"
with open(fn_mds_path, 'w') as f:
for _ in fn_mds_meta:
for i in _.keys():
f.write(i + ':' + str(_[i]) + '\t')
with open(mds_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')
with open(fn_vio_path, 'w') as f:
for _ in fn_vio_meta:
for i in _.keys():
f.write(i + ':' + str(_[i]) + '\t')
with open(vio_path, 'w') as f:
for _ in vio_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')
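For reference, every line written above has the form "Target:<target_attr>\t<threshold dict>", for example (made-up attribute names and thresholds; target_attr comes from settings.py):

Target:id	{'title': 0.85, 'brand': 0.7, 'price': 0.7, 'id': 1.0}

The updated load_mds() recovers the dictionary with eval(line.strip().split('\t')[1]).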

@ -7,7 +7,7 @@ import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from settings import model, embedding_dict, er_output_dir
from settings import model, er_output_dir
from sentence_transformers.util import cos_sim
conf_thresh = 0.8
@ -91,7 +91,7 @@ def test_load():
# # print(sim.tolist()[0][0]/2 + 0.5)
def if_minimal(md, md_list, target_col):
def is_minimal(md, md_list, target_col):
# assume this MD is minimal
minimal = True
if len(md_list) == 0:
@ -153,7 +153,7 @@ def inference_from_record_pairs(path, threshold, target_col):
# sims holds the per-column similarities of the two rows
sims = {}
for col in columns:
similarity = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)])
similarity = norm_cos_sim(getattr(row1, col), getattr(row2, col))
sims[col] = similarity
# find violated MDs, remove them from md_list and add them to the vio list
@ -178,7 +178,7 @@ def inference_from_record_pairs(path, threshold, target_col):
# new_rhs = sims[target_col]
# spec_r_md = copy.deepcopy(vio_md)
# spec_r_md[target_col] = new_rhs
# if if_minimal(spec_r_md, md_list, target_col):
# if is_minimal(spec_r_md, md_list, target_col):
# md_list.append(spec_r_md)
# specialize the left-hand side
@ -186,11 +186,11 @@ def inference_from_record_pairs(path, threshold, target_col):
if sims[col] + 0.01 <= 1:
spec_l_md = copy.deepcopy(vio_md)
spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
if if_minimal(spec_l_md, md_list, target_col):
if is_minimal(spec_l_md, md_list, target_col):
md_list.append(spec_l_md)
# for vio in minimal_vio[:]:
# if not if_minimal(vio, md_list, target_col):
# if not is_minimal(vio, md_list, target_col):
# minimal_vio.remove(vio)
# fuck = len(minimal_vio)
@ -216,11 +216,11 @@ def inference_from_record_pairs(path, threshold, target_col):
# minimal_vio = list(proxy_minimal_vio)
#
# for _ in minimal_vio[:]:
# if not if_minimal(_, minimal_vio, target_col):
# if not is_minimal(_, minimal_vio, target_col):
# minimal_vio.remove(_)
#
for _ in md_list[:]:
if not if_minimal(_, md_list, target_col):
if not is_minimal(_, md_list, target_col):
md_list.remove(_)
return md_list, minimal_vio
@ -258,7 +258,7 @@ def get_one_md_metadata(md, dataframe, target_col):
left_satisfy = True
both_satisfy = True
for col in columns:
sim = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)])
sim = norm_cos_sim(getattr(row1, col), getattr(row2, col))
if col == target_col:
if sim + 0.0000001 < 1:
both_satisfy = False

@ -4,27 +4,18 @@ from concurrent.futures import ProcessPoolExecutor
from multiprocessing.managers import SharedMemoryManager
import numpy as np
import pandas
import pandas as pd
import Levenshtein
import copy
import torch
from tqdm import tqdm
from md_discovery.multi_process_infer_by_pairs import norm_cos_sim
from settings import embedding_dict, model
from settings import model, md_output_dir
conf_thresh = 0.8
def my_Levenshtein_ratio(str1, str2):
if max(len(str1), len(str2)) == 0:
return 1
return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
def if_minimal(md, md_list, target_col):
def is_minimal(md, md_list, target_col):
# assume this MD is minimal
if len(md_list) == 0:
return True
@ -49,23 +40,7 @@ def if_minimal(md, md_list, target_col):
return minimal
def remove_by_confidence(md, md_list, relation, sim_tensor, target_col, lock):
support, confidence = get_one_md_metadata(md, relation, sim_tensor, target_col)
if confidence < 0.8:
with lock:
md_list.remove(md)
# def remove_by_confidence(md, l, relation, target_col):
# boolean, conf = satisfy_confidence(md, relation, 0.8, target_col)
# if not boolean:
# l.remove(md)
# print(md, '\t', conf)
# def build_sim_matrix():
# width
# return 0
def inference_from_record_pairs(path, threshold, target_col):
def pairs_inference(path, threshold, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
@ -87,7 +62,7 @@ def inference_from_record_pairs(path, threshold, target_col):
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
sim_tensor = sim_tensor/2 + 0.5
torch.save(sim_tensor, "E:\\Data\\Research\\Projects\\matching_dependency\\tensor.pt")
torch.save(sim_tensor, md_output_dir + "tensor.pt")
md_list = []
minimal_vio = []
@ -108,11 +83,10 @@ def inference_from_record_pairs(path, threshold, target_col):
sims[col] = similarity
# find violated MDs, remove them from md_list and add them to the vio list
# tmp_md_list = copy.deepcopy(md_list)
for md in md_list[:]:
lhs_satis = True
rhs_satis = True
for col in list(set(columns) - {target_col}):
for col in cols_but_target:
if sims[col] < md[col]:
lhs_satis = False
break
@ -123,32 +97,23 @@ def inference_from_record_pairs(path, threshold, target_col):
violated_mds.append(md)
for vio_md in violated_mds:
# specialize the right-hand side: the right-hand side must be fully similar, so there is no need to lower its threshold
# if sims[target_col] >= threshold:
# new_rhs = sims[target_col]
# spec_r_md = copy.deepcopy(vio_md)
# spec_r_md[target_col] = new_rhs
# if if_minimal(spec_r_md, md_list, target_col):
# md_list.append(spec_r_md)
# specialize the left-hand side
for col in list(set(columns) - {target_col}):
for col in cols_but_target:
if sims[col] + 0.01 <= 1:
spec_l_md = copy.deepcopy(vio_md)
spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
if if_minimal(spec_l_md, md_list, target_col):
if is_minimal(spec_l_md, md_list, target_col):
md_list.append(spec_l_md)
if vio_md not in minimal_vio:
minimal_vio.append(vio_md)
if len(md_list) == 0:
terminate = True
break
# tmp_minimal_vio = copy.deepcopy(minimal_vio)
if terminate:
break
if len(md_list) > 0:
for vio in minimal_vio[:]:
if not if_minimal(vio, md_list, target_col):
if not is_minimal(vio, md_list, target_col):
minimal_vio.remove(vio)
print('mds_list\t', len(md_list), '\n')
@ -157,64 +122,23 @@ def inference_from_record_pairs(path, threshold, target_col):
if len(minimal_vio) == 0:
return md_list, []
# manager = multiprocessing.Manager()
# lock = manager.Lock()
# pool_size = 4
# pool = multiprocessing.Pool(pool_size)
# with manager:
# proxy_minimal_vio = manager.list(minimal_vio)
# for _ in minimal_vio[:]:
# pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, sim_tensor, target_col, lock))
# pool.close()
# pool.join()
# minimal_vio = list(proxy_minimal_vio)
# minimal_vio.reverse()
i = 0
remove_list = []
fuck = []
# fuck = []
for md in minimal_vio:
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
fuck.append((support, confidence))
# fuck.append((support, confidence))
if support < 1:
print('delete by support')
remove_list.append(md)
if confidence < 0.8:
if confidence < 0.5:
print('delete by confidence')
remove_list.append(md)
fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
# while i < len(minimal_vio):
# print('vio_index\t', i)
# print('vio_length', len(minimal_vio))
# current_md = minimal_vio[i]
# support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
# # if support < 50:
# # minimal_vio_length = len(minimal_vio)
# # j = i + 1
# # while j < len(minimal_vio):
# # specialization = True
# # next_md = minimal_vio[j]
# # for col in cols_but_target:
# # if current_md[col] > next_md[col]:
# # specialization = False
# # break
# # if specialization:
# # minimal_vio.remove(next_md)
# # else:
# # j += 1
# # print('sup')
# # minimal_vio.remove(current_md)
# if support < 1:
# print('delete by support')
# minimal_vio.remove(current_md)
# if confidence < 0.8:
# print('delete by confidence')
# minimal_vio.remove(current_md)
# if support >= 1 and confidence >= 0.8:
# i += 1
# fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
for _ in remove_list:
minimal_vio.remove(_)
for _ in minimal_vio[:]:
if not if_minimal(_, minimal_vio, target_col):
if not is_minimal(_, minimal_vio, target_col):
minimal_vio.remove(_)
print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')

@ -1,33 +1,35 @@
import json
import os
import sys
import ConfigSpace
import pandas
import torch
from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import six
from ConfigSpace import Configuration
from md_discovery.multi_process_infer_by_pairs import my_Levenshtein_ratio, norm_cos_sim
from settings import *
def process_prediction_for_md_discovery(pred: pd.DataFrame,
tp_single_tuple_path: str = er_output_dir + "tp_single_tuple.csv",
fn_single_tuple_path: str = er_output_dir + "fn_single_tuple.csv"):
t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"):
# extract the true-positive and false-negative parts of the prediction table
tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
# concatenate them into a single table
df = pd.concat([tp, fn])
# align the right-table IDs with the left-table IDs in the TP/FN rows
for index, row in tp.iterrows():
tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
for index, row in fn.iterrows():
fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
pred_columns = pred.columns.values.tolist()
l_columns = []
r_columns = []
columns = []
cols = []
# collect the left-table and right-table column names of the prediction table into two separate lists
for _ in pred_columns:
if _.startswith('ltable'):
@ -36,25 +38,15 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
r_columns.append(_)
# strip the prefix from the left-table column names to form a unified column list (assuming corresponding columns in the two tables have been aligned)
for _ in l_columns:
columns.append(_.replace('ltable_', ''))
# split the table into left and right parts
tpl = tp[l_columns]
tpr = tp[r_columns]
# unify the column names of the two parts
tpl.columns = columns
tpr.columns = columns
cols.append(_.replace('ltable_', ''))
fnl = fn[l_columns]
fnr = fn[r_columns]
fnl.columns = columns
fnr.columns = columns
ldf = df[l_columns]
rdf = df[r_columns]
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
tp_single_tuple = pd.concat([tpl, tpr])
fn_single_tuple = pd.concat([fnl, fnr])
tp_single_tuple.to_csv(tp_single_tuple_path, sep=',', index=False, header=True)
fn_single_tuple.to_csv(fn_single_tuple_path, sep=',', index=False, header=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True)
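For reference, a tiny made-up example of the reshaping performed above: one TP pair and one FN pair become four single-tuple rows with unified column names, and matching records share the same id because the right-table id is overwritten with the left-table id:

# predictions (abbreviated, hypothetical):
#   ltable_id  ltable_name   rtable_id  rtable_name   gold  predicted
#   l1         iPhone 12     l1         iPhone12      1     1          (TP, id already aligned)
#   l2         Galaxy S9     l2         Gal. S9       1     0          (FN, id already aligned)
# t_single_tuple.csv (ldf stacked on top of rdf):
#   id   name
#   l1   iPhone 12
#   l2   Galaxy S9
#   l1   iPhone12
#   l2   Gal. S9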
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@ -100,43 +92,57 @@ def load_mds(paths: list) -> list:
# read the MD on each line and append it to this file's MD list
for line in f.readlines():
md_metadata = line.strip().split('\t')
md = eval(md_metadata[0].replace('md:', ''))
confidence = eval(md_metadata[2].replace('confidence:', ''))
if confidence > 0:
mds.append(md)
# TODO: update this if the MD file format changes
md = eval(md_metadata[1])
mds.append(md)
all_mds.extend(mds)
return all_mds
def is_explicable(row, all_mds: list) -> bool:
def is_explicable(row, all_mds: list, st_dict) -> bool:
attrs = all_mds[0].keys() # read all attributes from the first MD
for md in all_mds:
explicable = True # assume this MD explains the current tuple
for a in attrs:
threshold = md[a]
if norm_cos_sim(embedding_dict[str(getattr(row, 'ltable_'+a))],
embedding_dict[str(getattr(row, 'rtable_'+a))]) < threshold:
explicable = False # if any attribute's similarity is below its threshold, this MD cannot explain the current tuple
break # skip this MD's remaining thresholds and move on to the next MD
if a != target_attr:
if st_dict[a][row[0]].item() < md[a]:
explicable = False # if any attribute's similarity is below its threshold, this MD cannot explain the current tuple
break # skip this MD's remaining thresholds and move on to the next MD
if explicable:
return True # return as soon as any MD explains the tuple
return False # finished iterating: no MD explains the tuple
def load_data(left_path: str, right_path: str, mapping_path: str):
left = pd.read_csv(left_path, encoding='ISO-8859-1')
cm.set_key(left, left.columns.values.tolist()[0])
left.fillna("", inplace=True)
left = left.astype(str)
right = pd.read_csv(right_path, encoding='ISO-8859-1')
cm.set_key(right, right.columns.values.tolist()[0])
right.fillna("", inplace=True)
right = right.astype(str)
mapping = pd.read_csv(mapping_path)
mapping = mapping.astype(str)
return left, right, mapping
def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = predictions.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# normalized embedding tensor of the predictions table
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = sim_tensor / 2 + 0.5
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
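For reference, a minimal sketch of how the returned dictionary is consumed: each unprefixed attribute name maps to a 1-D GPU tensor of per-row similarities in [0, 1], indexed by the prediction row's positional index, which is exactly how the updated is_explicable() reads it (the attribute name below is illustrative):

sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
# sim_tensor_dict['name'] has shape (len(predictions),)
row0_name_sim = sim_tensor_dict['name'][0].item()  # similarity of ltable_name vs. rtable_name for row 0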
def ml_er(iter_round: int, config: Configuration = None, ):
@ -277,30 +283,27 @@ def ml_er(iter_round: int, config: Configuration = None, ):
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
process_prediction_for_md_discovery(predictions)
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt',
md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt']
epl_match = 0 # explainable predictions labeled as match
nepl_mismatch = 0 # unexplainable predictions labeled as mismatch
md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
md_list = load_mds(md_paths) # load all MDs from the MD output files
epl_match = 0 # explainable predictions labeled as match
if len(md_list) > 0:
for row in predictions.itertuples():
if is_explicable(row, md_list):
if getattr(row, 'predicted') == 1:
epl_match += 1
else:
if getattr(row, 'predicted') == 0:
nepl_mismatch += 1
interpretability = (epl_match + nepl_mismatch) / len(predictions) # interpretability
if is_explicable(row, md_list, sim_tensor_dict) and str(getattr(row, 'predicted')) == str(1):
epl_match += 1
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability
if indicators["block_recall"] >= 0.8:
f1 = indicators["F1"]
else:
f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
################################################################################################################
process_prediction_for_md_discovery(predictions)
output_path = er_output_dir + "eval_result_" + str(iter_round) + ".txt"
with open(output_path, 'w') as f:
@ -313,4 +316,20 @@ def ml_er(iter_round: int, config: Configuration = None, ):
if __name__ == '__main__':
ml_er(1)
iterations = 1
filename_list = os.listdir(er_output_dir)
if len(filename_list) > 0:
for _ in filename_list:
if _.startswith('eval_result'):
iterations = int(_[12:13]) + 1
if iterations > 1:
incumbent_array = np.load(hpo_output_dir + 'incumbent.npy')
with open(hpo_output_dir + "configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
configuration = ConfigSpace.Configuration(configspace, vector=incumbent_array)
ml_er(iterations, configuration)
else:
ml_er(1)
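A hedged sketch, not part of this commit, of a more defensive round-number parse: the slice int(_[12:13]) above reads a single digit only, so a file such as eval_result_12.txt would be misparsed.

import re

iterations = 1
for name in os.listdir(er_output_dir):
    m = re.fullmatch(r'eval_result_(\d+)\.txt', name)
    if m:
        iterations = max(iterations, int(m.group(1)) + 1)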

@ -16,6 +16,5 @@ confidence_threshold = 0.8
interpre_weight = 0.3 # weight of the interpretability term
er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2')
embedding_dict = np.load('E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\embedding_dic.npy',
allow_pickle=True).item()

@ -1,13 +1,16 @@
import json
import multiprocessing
import os
import time
import ConfigSpace
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
from ConfigSpace.read_and_write import json as csj
from md_discovery import tmp_discover
from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
from settings import er_output_dir, similarity_threshold, target_attr, hpo_output_dir
def fuck(i):
@ -64,9 +67,60 @@ def test4():
print(torch.count_nonzero(one_bool_tensor).item())
def test5():
ten1 = torch.tensor([[1, 2, 3],
[7, 8, 9]])
ten2 = torch.tensor([[4, 5, 6],
[11, 12, 15]])
result = ten1 * ten2
r = torch.sum(result, 1)
print('\n')
print(result)
print(r)
def test6():
table_tensor = torch.tensor([[[1., 2., 3.],
[4., 5., 6.],
[7., 8., 9.]],
[[1., 2., 3.],
[4., 5., 6.],
[7., 8., 9.]]])
t = torch.tensor([[1., 2., 3.],
[4., 5., 6.]])
norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
print('\n')
print(norm1)
print(norm2)
print(t.shape)
def test7():
iterations = 1
filename_list = os.listdir(er_output_dir)
if len(filename_list) > 0:
for _ in filename_list:
if _.startswith('eval_result'):
iterations = int(_[12:13]) + 1
print(iterations)
def test8():
cum = np.load(hpo_output_dir + 'incumbent.npy')
with open(hpo_output_dir + "configspace.json", 'r') as load_f:
dict_configspace = json.load(load_f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
config = ConfigSpace.Configuration(configspace, vector=cum)
print(cum)
if __name__ == '__main__':
start = time.time()
tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
# tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
tp_mds, tp_vio = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr)
print(time.time() - start)
