1. All similarity computations now run on the GPU
2. The iteration round is detected automatically (see the sketch below)
3. Hyperparameter-optimization results are persisted to disk
HuangJintao, 1 year ago
parent 20c33c0fd8
commit 882c25d20f
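A standalone sketch of the round-detection step referenced in point 2 (the same logic appears in the ml_er main block further down; the function name next_iteration_round is illustrative and, like the committed slice _[12:13], it assumes single-digit round numbers):

import os

def next_iteration_round(er_output_dir: str) -> int:
    # Start at round 1; bump past whichever eval_result_<n>.txt files already exist.
    iterations = 1
    for name in os.listdir(er_output_dir):
        if name.startswith('eval_result'):
            # 'eval_result_' is 12 characters, so index 12 holds the round digit
            iterations = int(name[12:13]) + 1
    return iterations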
.gitignore vendored

@@ -1,2 +1,5 @@
 /deprecated/
 /datasets/
+/ml_er/output/*
+/md_discovery/output/*
+/hpo/output/*

@@ -1,14 +1,18 @@
 import os
+import numpy as np
+import torch
+import json
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
+from ConfigSpace.read_and_write import json as csj
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
 from settings import *
-from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
+from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict

 # data is loaded externally
 ########################################################################################################################
@@ -171,23 +175,23 @@ class Classifier:
         predictions_attrs.extend(attrs_with_r_prefix)
         predictions_attrs.extend(['gold', 'predicted'])
         predictions = predictions[predictions_attrs]
+        predictions = predictions.reset_index(drop=True)
+        predictions = predictions.astype(str)
+        sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
         # default path is "../md_discovery/output/xxx.txt"
-        # true-positive/false-negative mds/vio: 4 MD files in total
-        md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt',
-                    md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt']
-        epl_match = 0  # explicable predictions that are matches
-        nepl_mismatch = 0  # inexplicable predictions that are mismatches
+        # mds/vio: 2 MD files in total
+        md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
         md_list = load_mds(md_paths)  # read all the MDs
+        epl_match = 0  # explicable predictions that are matches
         if len(md_list) > 0:
             for line in predictions.itertuples():
-                if is_explicable(line, md_list):
-                    if getattr(line, 'predicted') == 1:
-                        epl_match += 1
-                else:
-                    if getattr(line, 'predicted') == 0:
-                        nepl_mismatch += 1
-        interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+                if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1):
+                    epl_match += 1
+        ppre = predictions[predictions['predicted'] == str(1)]
+        interpretability = epl_match / len(ppre)  # interpretability
+        # todo: block_recall could use the commented-out branch below
         # if indicators["block_recall"] >= 0.8:
         #     f1 = indicators["F1"]
         # else:
@@ -201,10 +205,15 @@ class Classifier:
 def ml_er_hpo():
     classifier = Classifier()
+    cs = classifier.configspace
+    str_configspace = csj.write(cs)
+    dict_configspace = json.loads(str_configspace)
+    with open(hpo_output_dir + "configspace.json", "w") as f:
+        json.dump(dict_configspace, f)

     # Next, we create an object holding general information about the run
     scenario = Scenario(
-        classifier.configspace,
+        cs,
         deterministic=True,
         n_trials=10,  # run at most 10 trials (combination of config and seed)
         n_workers=1
@@ -221,9 +230,11 @@
     )

     incumbent = smac.optimize()
+    incumbent_ndarray = incumbent.get_array()
+    np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray)

     # Get cost of default configuration
-    default_cost = smac.validate(classifier.configspace.get_default_configuration())
+    default_cost = smac.validate(cs.get_default_configuration())
     print(f"Default cost: {default_cost}")

     # Let's calculate the cost of the incumbent
@@ -235,4 +246,4 @@ def ml_er_hpo():
 if __name__ == '__main__':
-    print(1)
+    ml_er_hpo()
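The hunks above write two artifacts per HPO run: the ConfigurationSpace as JSON and the SMAC incumbent as a .npy vector. A minimal sketch of that persistence round-trip (assuming SMAC3/ConfigSpace as imported above; the two helper function names are illustrative, not part of the commit):

import json
import numpy as np
import ConfigSpace
from ConfigSpace.read_and_write import json as csj

def save_incumbent(cs, incumbent, hpo_output_dir):
    # serialise the configuration space as JSON and the incumbent as a raw vector
    with open(hpo_output_dir + "configspace.json", "w") as f:
        json.dump(json.loads(csj.write(cs)), f)
    np.save(hpo_output_dir + "incumbent.npy", incumbent.get_array())

def load_incumbent(hpo_output_dir):
    # rebuild the ConfigurationSpace, then the tuned Configuration, from disk
    with open(hpo_output_dir + "configspace.json", "r") as f:
        cs = csj.read(json.dumps(json.load(f)))
    vector = np.load(hpo_output_dir + "incumbent.npy")
    return ConfigSpace.Configuration(cs, vector=vector)

The load side of this round-trip is exactly what the ml_er main block (later in this commit) uses to rerun entity resolution with the tuned configuration.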

@@ -1,5 +1,4 @@
-from md_discovery.multi_process_infer_by_pairs import inference_from_record_pairs
-from md_discovery.multi_process_infer_by_pairs import get_mds_metadata
+from md_discovery import tmp_discover
 from settings import *

 # # if support and confidence are not needed in the output, use the following two code blocks
@@ -18,51 +17,25 @@ from settings import *
 def md_discover():
     # for now, follow this main function as a template
-    tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
-    fn_single_tuple_path = er_output_dir + "fn_single_tuple.csv"
+    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
     # input: csv file path, similarity threshold for the MD left-hand side, target attribute for the MD right-hand side
-    # output: 2 MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
-    # e.g. the parameters here require every left-hand-side similarity to be at least 0.7 and the right-hand side to point to the 'id' attribute
-    tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    fn_mds, fn_vio = inference_from_record_pairs(fn_single_tuple_path, similarity_threshold, target_attr)
-
-    # if support and confidence are not needed in the output, remove the lines below
-    tp_mds_meta = get_mds_metadata(tp_mds, tp_single_tuple_path, target_attr)
-    tp_vio_meta = get_mds_metadata(tp_vio, tp_single_tuple_path, target_attr)
-
-    fn_mds_meta = get_mds_metadata(fn_mds, fn_single_tuple_path, target_attr)
-    fn_vio_meta = get_mds_metadata(fn_vio, fn_single_tuple_path, target_attr)
-
-    # to output support and confidence, use the two blocks below
+    # output: 2 MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold
+    mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr)

     # write list 1 to a local path (adjust the path as needed)
-    tp_mds_path = md_output_dir + "tp_mds.txt"
-    tp_vio_path = md_output_dir + "tp_vio.txt"
-    with open(tp_mds_path, 'w') as f:
-        for _ in tp_mds_meta:
-            for i in _.keys():
-                f.write(i + ':' + str(_[i]) + '\t')
-            f.write('\n')
-
-    with open(tp_vio_path, 'w') as f:
-        for _ in tp_vio_meta:
-            for i in _.keys():
-                f.write(i + ':' + str(_[i]) + '\t')
-            f.write('\n')
-
-    fn_mds_path = md_output_dir + "fn_mds.txt"
-    fn_vio_path = md_output_dir + "fn_vio.txt"
-    with open(fn_mds_path, 'w') as f:
-        for _ in fn_mds_meta:
-            for i in _.keys():
-                f.write(i + ':' + str(_[i]) + '\t')
-            f.write('\n')
-
-    with open(fn_vio_path, 'w') as f:
-        for _ in fn_vio_meta:
-            for i in _.keys():
-                f.write(i + ':' + str(_[i]) + '\t')
-            f.write('\n')
+    mds_path = md_output_dir + "mds.txt"
+    vio_path = md_output_dir + "vio.txt"
+    with open(mds_path, 'w') as f:
+        for _ in mds_list:
+            f.write('Target:'+str(target_attr) + '\t')
+            f.write(str(_))
+            f.write('\n')
+    with open(vio_path, 'w') as f:
+        for _ in vio_list:
+            f.write('Target:'+str(target_attr) + '\t')
+            f.write(str(_))
+            f.write('\n')

@@ -7,7 +7,7 @@ import time
 import torch
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModel
-from settings import model, embedding_dict, er_output_dir
+from settings import model, er_output_dir
 from sentence_transformers.util import cos_sim

 conf_thresh = 0.8
@@ -91,7 +91,7 @@ def test_load():
 # # print(sim.tolist()[0][0]/2 + 0.5)

-def if_minimal(md, md_list, target_col):
+def is_minimal(md, md_list, target_col):
     # assume this md is minimal
     minimal = True
     if len(md_list) == 0:
@@ -153,7 +153,7 @@ def inference_from_record_pairs(path, threshold, target_col):
         # sims holds the similarities between the two rows
         sims = {}
         for col in columns:
-            similarity = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)])
+            similarity = norm_cos_sim(getattr(row1, col), getattr(row2, col))
             sims[col] = similarity

         # find violated MDs: remove them from the md list and add them to the vio list
@@ -178,7 +178,7 @@ def inference_from_record_pairs(path, threshold, target_col):
             #     new_rhs = sims[target_col]
             #     spec_r_md = copy.deepcopy(vio_md)
            #     spec_r_md[target_col] = new_rhs
-            #     if if_minimal(spec_r_md, md_list, target_col):
+            #     if is_minimal(spec_r_md, md_list, target_col):
             #         md_list.append(spec_r_md)

             # specialise the left-hand side
@@ -186,11 +186,11 @@ def inference_from_record_pairs(path, threshold, target_col):
                 if sims[col] + 0.01 <= 1:
                     spec_l_md = copy.deepcopy(vio_md)
                     spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
-                    if if_minimal(spec_l_md, md_list, target_col):
+                    if is_minimal(spec_l_md, md_list, target_col):
                         md_list.append(spec_l_md)

     # for vio in minimal_vio[:]:
-    #     if not if_minimal(vio, md_list, target_col):
+    #     if not is_minimal(vio, md_list, target_col):
     #         minimal_vio.remove(vio)
     # fuck = len(minimal_vio)
@@ -216,11 +216,11 @@ def inference_from_record_pairs(path, threshold, target_col):
    #     minimal_vio = list(proxy_minimal_vio)
    #
    # for _ in minimal_vio[:]:
-   #     if not if_minimal(_, minimal_vio, target_col):
+   #     if not is_minimal(_, minimal_vio, target_col):
    #         minimal_vio.remove(_)
    #
    for _ in md_list[:]:
-        if not if_minimal(_, md_list, target_col):
+        if not is_minimal(_, md_list, target_col):
            md_list.remove(_)

    return md_list, minimal_vio
@@ -258,7 +258,7 @@ def get_one_md_metadata(md, dataframe, target_col):
         left_satisfy = True
         both_satisfy = True
         for col in columns:
-            sim = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)])
+            sim = norm_cos_sim(getattr(row1, col), getattr(row2, col))
             if col == target_col:
                 if sim + 0.0000001 < 1:
                     both_satisfy = False

@@ -4,27 +4,18 @@ from concurrent.futures import ProcessPoolExecutor
 from multiprocessing.managers import SharedMemoryManager
 import numpy as np
-import pandas
 import pandas as pd
-import Levenshtein
 import copy
 import torch
 from tqdm import tqdm
-from md_discovery.multi_process_infer_by_pairs import norm_cos_sim
-from settings import embedding_dict, model
+from settings import model, md_output_dir

 conf_thresh = 0.8

-def my_Levenshtein_ratio(str1, str2):
-    if max(len(str1), len(str2)) == 0:
-        return 1
-    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
-
-def if_minimal(md, md_list, target_col):
+def is_minimal(md, md_list, target_col):
     # assume this md is minimal
     if len(md_list) == 0:
         return True
@@ -49,23 +40,7 @@ def if_minimal(md, md_list, target_col):
     return minimal

-def remove_by_confidence(md, md_list, relation, sim_tensor, target_col, lock):
-    support, confidence = get_one_md_metadata(md, relation, sim_tensor, target_col)
-    if confidence < 0.8:
-        with lock:
-            md_list.remove(md)
-
-# def remove_by_confidence(md, l, relation, target_col):
-#     boolean, conf = satisfy_confidence(md, relation, 0.8, target_col)
-#     if not boolean:
-#         l.remove(md)
-#         print(md, '\t', conf)
-
-# def build_sim_matrix():
-#     width
-#     return 0
-
-def inference_from_record_pairs(path, threshold, target_col):
+def pairs_inference(path, threshold, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)
     data = data.astype(str)
@@ -87,7 +62,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
     sim_tensor = sim_tensor/2 + 0.5
-    torch.save(sim_tensor, "E:\\Data\\Research\\Projects\\matching_dependency\\tensor.pt")
+    torch.save(sim_tensor, md_output_dir + "tensor.pt")

     md_list = []
     minimal_vio = []
@@ -108,11 +83,10 @@ def inference_from_record_pairs(path, threshold, target_col):
             sims[col] = similarity

         # find violated MDs: remove them from the md list and add them to the vio list
-        # tmp_md_list = copy.deepcopy(md_list)
         for md in md_list[:]:
             lhs_satis = True
             rhs_satis = True
-            for col in list(set(columns) - {target_col}):
+            for col in cols_but_target:
                 if sims[col] < md[col]:
                     lhs_satis = False
                     break
@@ -123,32 +97,23 @@ def inference_from_record_pairs(path, threshold, target_col):
                     violated_mds.append(md)

             for vio_md in violated_mds:
-                # specialise the right-hand side; we require the RHS to be 100% similar, so there is no need to lower its threshold
-                # if sims[target_col] >= threshold:
-                #     new_rhs = sims[target_col]
-                #     spec_r_md = copy.deepcopy(vio_md)
-                #     spec_r_md[target_col] = new_rhs
-                #     if if_minimal(spec_r_md, md_list, target_col):
-                #         md_list.append(spec_r_md)
                 # specialise the left-hand side
-                for col in list(set(columns) - {target_col}):
+                for col in cols_but_target:
                     if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
                         spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
-                        if if_minimal(spec_l_md, md_list, target_col):
+                        if is_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
                 if vio_md not in minimal_vio:
                     minimal_vio.append(vio_md)
                 if len(md_list) == 0:
                     terminate = True
                     break
-            # tmp_minimal_vio = copy.deepcopy(minimal_vio)
             if terminate:
                 break

     if len(md_list) > 0:
         for vio in minimal_vio[:]:
-            if not if_minimal(vio, md_list, target_col):
+            if not is_minimal(vio, md_list, target_col):
                 minimal_vio.remove(vio)

     print('mds_list\t', len(md_list), '\n')
@@ -157,64 +122,23 @@ def inference_from_record_pairs(path, threshold, target_col):
     if len(minimal_vio) == 0:
         return md_list, []

-    # manager = multiprocessing.Manager()
-    # lock = manager.Lock()
-    # pool_size = 4
-    # pool = multiprocessing.Pool(pool_size)
-    # with manager:
-    #     proxy_minimal_vio = manager.list(minimal_vio)
-    #     for _ in minimal_vio[:]:
-    #         pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, sim_tensor, target_col, lock))
-    #     pool.close()
-    #     pool.join()
-    #     minimal_vio = list(proxy_minimal_vio)
-
-    # minimal_vio.reverse()
-    i = 0
     remove_list = []
-    fuck = []
+    # fuck = []
     for md in minimal_vio:
         support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-        fuck.append((support, confidence))
+        # fuck.append((support, confidence))
         if support < 1:
             print('delete by support')
             remove_list.append(md)
-        if confidence < 0.8:
+        if confidence < 0.5:
             print('delete by confidence')
             remove_list.append(md)
-    fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
-    # while i < len(minimal_vio):
-    #     print('vio_index\t', i)
-    #     print('vio_length', len(minimal_vio))
-    #     current_md = minimal_vio[i]
-    #     support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
-    #     # if support < 50:
-    #     #     minimal_vio_length = len(minimal_vio)
-    #     #     j = i + 1
-    #     #     while j < len(minimal_vio):
-    #     #         specialization = True
-    #     #         next_md = minimal_vio[j]
-    #     #         for col in cols_but_target:
-    #     #             if current_md[col] > next_md[col]:
-    #     #                 specialization = False
-    #     #                 break
-    #     #         if specialization:
-    #     #             minimal_vio.remove(next_md)
-    #     #         else:
-    #     #             j += 1
-    #     #     print('sup')
-    #     #     minimal_vio.remove(current_md)
-    #     if support < 1:
-    #         print('delete by support')
-    #         minimal_vio.remove(current_md)
-    #     if confidence < 0.8:
-    #         print('delete by confidence')
-    #         minimal_vio.remove(current_md)
-    #     if support >= 1 and confidence >= 0.8:
-    #         i += 1
+    # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
+    for _ in remove_list:
+        minimal_vio.remove(_)

     for _ in minimal_vio[:]:
-        if not if_minimal(_, minimal_vio, target_col):
+        if not is_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)

     print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
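The hunks above are where commit point 1 lands: every cell of the single-tuple table is embedded once with the SentenceTransformer, and a single batched matmul on the GPU replaces all per-pair similarity calls. A minimal sketch of that pattern (assuming a pandas DataFrame of strings and the `model` from settings.py; the function name is illustrative):

import torch

def table_similarity_tensor(table, model):
    # embed every cell column-by-column, so the split below yields per-column blocks
    length, width = table.shape
    sentences = [table.values[row, col] for col in range(width) for row in range(length)]
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    table_tensor = torch.stack(torch.split(embedding, length, dim=0), dim=0)  # (cols, rows, dim)
    norm_tensor = torch.nn.functional.normalize(table_tensor, dim=2)          # unit-length embeddings
    sim = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))              # pairwise cosine similarities
    return sim / 2 + 0.5                                                      # rescale from [-1, 1] to [0, 1]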

@@ -1,33 +1,35 @@
+import json
 import os
 import sys
+import ConfigSpace
+import pandas
+import torch
 from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
+from ConfigSpace.read_and_write import json as csj
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 import six
 from ConfigSpace import Configuration
-from md_discovery.multi_process_infer_by_pairs import my_Levenshtein_ratio, norm_cos_sim
 from settings import *

 def process_prediction_for_md_discovery(pred: pd.DataFrame,
-                                        tp_single_tuple_path: str = er_output_dir + "tp_single_tuple.csv",
-                                        fn_single_tuple_path: str = er_output_dir + "fn_single_tuple.csv"):
+                                        t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"):
     # extract the true-positive and false-negative parts of the prediction table
     tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
     fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
+    # concatenate them into one table
+    df = pd.concat([tp, fn])
     # make the left and right IDs consistent
-    for index, row in tp.iterrows():
-        tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
-    for index, row in fn.iterrows():
-        fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
+    for index, row in df.iterrows():
+        df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]

     pred_columns = pred.columns.values.tolist()
     l_columns = []
     r_columns = []
-    columns = []
+    cols = []
     # collect the left-table and right-table column names of the prediction table into two separate lists
     for _ in pred_columns:
         if _.startswith('ltable'):
@@ -36,25 +38,15 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
             r_columns.append(_)
     # strip the prefix from the left-table column names to form the unified column list (assuming corresponding columns of the two tables are aligned)
     for _ in l_columns:
-        columns.append(_.replace('ltable_', ''))
-    # split the table into a left part and a right part
-    tpl = tp[l_columns]
-    tpr = tp[r_columns]
-    # unify the column names of both parts
-    tpl.columns = columns
-    tpr.columns = columns
-
-    fnl = fn[l_columns]
-    fnr = fn[r_columns]
-    fnl.columns = columns
-    fnr.columns = columns
-
-    tp_single_tuple = pd.concat([tpl, tpr])
-    fn_single_tuple = pd.concat([fnl, fnr])
-
-    tp_single_tuple.to_csv(tp_single_tuple_path, sep=',', index=False, header=True)
-    fn_single_tuple.to_csv(fn_single_tuple_path, sep=',', index=False, header=True)
+        cols.append(_.replace('ltable_', ''))
+
+    ldf = df[l_columns]
+    rdf = df[r_columns]
+    ldf.columns = cols
+    rdf.columns = cols
+    t_single_tuple = pd.concat([ldf, rdf])
+
+    t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True)
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,

@@ -100,43 +92,57 @@ def load_mds(paths: list) -> list:
             # read the MD on each line and append it to this file's md list
             for line in f.readlines():
                 md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
+                # todo: if the format of the MD files changes, update this as well
+                md = eval(md_metadata[1])
+                mds.append(md)
         all_mds.extend(mds)
     return all_mds

-def is_explicable(row, all_mds: list) -> bool:
+def is_explicable(row, all_mds: list, st_dict) -> bool:
     attrs = all_mds[0].keys()  # read all attributes from the first MD
     for md in all_mds:
         explicable = True  # assume this MD can explain the current tuple
         for a in attrs:
-            threshold = md[a]
-            if norm_cos_sim(embedding_dict[str(getattr(row, 'ltable_'+a))],
-                            embedding_dict[str(getattr(row, 'rtable_'+a))]) < threshold:
-                explicable = False  # if any attribute's similarity misses its threshold, this MD cannot explain the tuple
-                break  # stop checking this MD's other thresholds and move on to the next MD
+            if a != target_attr:
+                if st_dict[a][row[0]].item() < md[a]:
+                    explicable = False  # if any attribute's similarity misses its threshold, this MD cannot explain the tuple
+                    break  # stop checking this MD's other thresholds and move on to the next MD
         if explicable:
             return True  # explained by some MD: return immediately
     return False  # no MD explains the tuple

-def load_data(left_path: str, right_path: str, mapping_path: str):
-    left = pd.read_csv(left_path, encoding='ISO-8859-1')
-    cm.set_key(left, left.columns.values.tolist()[0])
-    left.fillna("", inplace=True)
-    left = left.astype(str)
-
-    right = pd.read_csv(right_path, encoding='ISO-8859-1')
-    cm.set_key(right, right.columns.values.tolist()[0])
-    right.fillna("", inplace=True)
-    right = right.astype(str)
-
-    mapping = pd.read_csv(mapping_path)
-    mapping = mapping.astype(str)
-    return left, right, mapping
+def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
+    predictions_attrs = predictions.columns.values.tolist()
+    col_tuple_list = []
+    for _ in predictions_attrs:
+        if _.startswith('ltable'):
+            left_index = predictions_attrs.index(_)
+            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
+            col_tuple_list.append((left_index, right_index))
+
+    length = predictions.shape[0]
+    width = predictions.shape[1]
+    sentences = []
+    for col in range(0, width):
+        for row in range(0, length):
+            cell_value = predictions.values[row, col]
+            sentences.append(cell_value)
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    # normalised embedding tensor of the predictions table
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+
+    sim_tensor_dict = {}
+    for col_tuple in col_tuple_list:
+        lattr_tensor = norm_table_tensor[col_tuple[0]]
+        rattr_tensor = norm_table_tensor[col_tuple[1]]
+        mul_tensor = lattr_tensor * rattr_tensor
+        sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = sim_tensor / 2 + 0.5
+        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
+    return sim_tensor_dict

 def ml_er(iter_round: int, config: Configuration = None, ):
@@ -277,30 +283,27 @@ def ml_er(iter_round: int, config: Configuration = None, ):
     predictions_attrs.extend(attrs_with_r_prefix)
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
-    process_prediction_for_md_discovery(predictions)
+    predictions = predictions.reset_index(drop=True)
+    predictions = predictions.astype(str)
+    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)

-    md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt',
-                md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt']
-    epl_match = 0  # explicable predictions that are matches
-    nepl_mismatch = 0  # inexplicable predictions that are mismatches
+    md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
     md_list = load_mds(md_paths)  # read all the MDs
+    epl_match = 0  # explicable predictions that are matches
     if len(md_list) > 0:
         for row in predictions.itertuples():
-            if is_explicable(row, md_list):
-                if getattr(row, 'predicted') == 1:
-                    epl_match += 1
-            else:
-                if getattr(row, 'predicted') == 0:
-                    nepl_mismatch += 1
-    interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+            if is_explicable(row, md_list, sim_tensor_dict) and str(getattr(row, 'predicted')) == str(1):
+                epl_match += 1
+
+    df = predictions[predictions['predicted'] == str(1)]
+    interpretability = epl_match / len(df)  # interpretability

     if indicators["block_recall"] >= 0.8:
         f1 = indicators["F1"]
     else:
         f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"])
     performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
     ################################################################################################################
+    process_prediction_for_md_discovery(predictions)

     output_path = er_output_dir + "eval_result_" + str(iter_round) + ".txt"
     with open(output_path, 'w') as f:
@@ -313,4 +316,20 @@ def ml_er(iter_round: int, config: Configuration = None, ):
 if __name__ == '__main__':
-    ml_er(1)
+    iterations = 1
+    filename_list = os.listdir(er_output_dir)
+    if len(filename_list) > 0:
+        for _ in filename_list:
+            if _.startswith('eval_result'):
+                iterations = int(_[12:13]) + 1
+
+    if iterations > 1:
+        incumbent_array = np.load(hpo_output_dir + 'incumbent.npy')
+        with open(hpo_output_dir + "configspace.json", 'r') as f:
+            dict_configspace = json.load(f)
+        str_configspace = json.dumps(dict_configspace)
+        configspace = csj.read(str_configspace)
+        configuration = ConfigSpace.Configuration(configspace, vector=incumbent_array)
+        ml_er(iterations, configuration)
+    else:
+        ml_er(1)
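With the hunks above, a prediction counts toward interpretability only if it is a predicted match explained by at least one discovered MD, and the score is normalised by the number of predicted matches rather than by all predictions. A minimal sketch of that measure under the commit's data layout (sim_tensor_dict maps each attribute to a 1-D similarity tensor indexed by row, as built above; the names explicable_fraction and match_row_indices are illustrative):

def explicable_fraction(match_row_indices, md_list, sim_tensor_dict, target_attr):
    explained = 0
    for i in match_row_indices:                      # only rows predicted as matches
        for md in md_list:
            # an MD explains the row if every left-hand-side threshold is met;
            # the target (right-hand-side) attribute is skipped
            if all(sim_tensor_dict[a][i].item() >= md[a]
                   for a in md if a != target_attr):
                explained += 1
                break                                # one satisfied MD is enough
    return explained / len(match_row_indices) if match_row_indices else 0.0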

@@ -16,6 +16,5 @@ confidence_threshold = 0.8
 interpre_weight = 0.3  # weight of the interpretability term
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
 model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2')
-embedding_dict = np.load('E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\embedding_dic.npy',
-                         allow_pickle=True).item()

@@ -1,13 +1,16 @@
+import json
 import multiprocessing
+import os
 import time

+import ConfigSpace
 import numpy as np
 import pandas as pd
 import torch
 from tqdm import tqdm
+from ConfigSpace.read_and_write import json as csj

-from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
 from md_discovery import tmp_discover
-from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
+from settings import er_output_dir, similarity_threshold, target_attr, hpo_output_dir

 def fuck(i):

@@ -64,9 +67,60 @@ def test4():
     print(torch.count_nonzero(one_bool_tensor).item())

+def test5():
+    ten1 = torch.tensor([[1, 2, 3],
+                         [7, 8, 9]])
+    ten2 = torch.tensor([[4, 5, 6],
+                         [11, 12, 15]])
+    result = ten1 * ten2
+    r = torch.sum(result, 1)
+    print('\n')
+    print(result)
+    print(r)
+
+def test6():
+    table_tensor = torch.tensor([[[1., 2., 3.],
+                                  [4., 5., 6.],
+                                  [7., 8., 9.]],
+                                 [[1., 2., 3.],
+                                  [4., 5., 6.],
+                                  [7., 8., 9.]]])
+    t = torch.tensor([[1., 2., 3.],
+                      [4., 5., 6.]])
+    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
+    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
+    print('\n')
+    print(norm1)
+    print(norm2)
+    print(t.shape)
+
+def test7():
+    iterations = 1
+    filename_list = os.listdir(er_output_dir)
+    if len(filename_list) > 0:
+        for _ in filename_list:
+            if _.startswith('eval_result'):
+                iterations = int(_[12:13]) + 1
+    print(iterations)
+
+def test8():
+    cum = np.load(hpo_output_dir + 'incumbent.npy')
+    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
+        dict_configspace = json.load(load_f)
+    str_configspace = json.dumps(dict_configspace)
+    configspace = csj.read(str_configspace)
+    config = ConfigSpace.Configuration(configspace, vector=cum)
+    print(cum)
+
 if __name__ == '__main__':
     start = time.time()
-    tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
+    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
     # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
+    tp_mds, tp_vio = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr)
     print(time.time() - start)
