Complete the Magellan experiments

MD-metrics-HPO
HuangJintao 6 months ago
parent f1e732afaf
commit fe5c1288ed

@ -1,73 +1,73 @@
# Cluster the data points together with the MDs
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from draw_md_cluster import DBSCAN
from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
clusterNum = len(set(labels_))
fig = plt.figure()
scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
ax = fig.add_subplot(111, projection='3d')
for i in range(-1, clusterNum):
colorStyle = scatterColors[i % len(scatterColors)]
subCluster = md_data_[np.where(labels_ == i)]
ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
if pre_mismatch_points_.shape[0] > 0:
ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
ax.set_xlabel(md_keys_[0], rotation=0) # set axis label rotation
ax.set_ylabel(md_keys_[1], rotation=-45)
ax.set_zlabel(md_keys_[2], rotation=0)
plt.title(output_path_.split('\\')[-1].split('.')[0])
plt.savefig(output_path_, dpi=500)
plt.show()
if __name__ == '__main__':
outcome_path = r'E:\Data\Research\Outcome'
config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
for dataset_name in dataset_name_list:
absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # path to the MD file
predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # path to the predictions file
pred = pd.read_csv(predictions)
pred = pred.astype(str)
# pred = pred[pred['predicted'] == str(1)]
sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
# the three selected columns
md_keys = []
with open(absolute_path, 'r') as f:
# read the MD on each line and append it to this file's MD list
md_data = []
for line in f.readlines():
md_metadata = line.strip().split('\t')
md_tuple = eval(md_metadata[1])
md_keys = list(md_tuple[0].keys())[1:4]
md_values = list(md_tuple[0].values())
md_data.append(md_values[1:4])
if len(md_data) == 10000:
break
pre_match_points = []
pre_mismatch_points = []
for _ in pred.itertuples():
data_point_value = []
for key in md_keys:
sim_tensor = sim_tensor_dict[key]
data_point_value.append(round(float(sim_tensor[_[0]]), 4))
if getattr(_, 'predicted') == str(1):
pre_match_points.append(data_point_value)
elif getattr(_, 'predicted') == str(0):
pre_mismatch_points.append(data_point_value)
md_data = np.array(md_data, dtype=np.float32)
pre_match_points = np.array(pre_match_points, dtype=np.float32)
pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
labels = DBSCAN(md_data, 0.5, 30)
output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)
# # Cluster the data points together with the MDs
# import os
# import numpy as np
# import pandas as pd
# from matplotlib import pyplot as plt
#
# from draw_md_cluster import DBSCAN
# from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
#
#
# def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
# clusterNum = len(set(labels_))
# fig = plt.figure()
# scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
# ax = fig.add_subplot(111, projection='3d')
# for i in range(-1, clusterNum):
# colorStyle = scatterColors[i % len(scatterColors)]
# subCluster = md_data_[np.where(labels_ == i)]
# ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
# ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
# if pre_mismatch_points_.shape[0] > 0:
# ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
# ax.set_xlabel(md_keys_[0], rotation=0) # set axis label rotation
# ax.set_ylabel(md_keys_[1], rotation=-45)
# ax.set_zlabel(md_keys_[2], rotation=0)
# plt.title(output_path_.split('\\')[-1].split('.')[0])
# plt.savefig(output_path_, dpi=500)
# plt.show()
#
#
# if __name__ == '__main__':
# outcome_path = r'E:\Data\Research\Outcome'
# config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
# dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
# for dataset_name in dataset_name_list:
# absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # path to the MD file
# predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # path to the predictions file
# pred = pd.read_csv(predictions)
# pred = pred.astype(str)
# # pred = pred[pred['predicted'] == str(1)]
# sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
# # the three selected columns
# md_keys = []
# with open(absolute_path, 'r') as f:
# # read the MD on each line and append it to this file's MD list
# md_data = []
# for line in f.readlines():
# md_metadata = line.strip().split('\t')
# md_tuple = eval(md_metadata[1])
# md_keys = list(md_tuple[0].keys())[1:4]
# md_values = list(md_tuple[0].values())
# md_data.append(md_values[1:4])
# if len(md_data) == 10000:
# break
#
# pre_match_points = []
# pre_mismatch_points = []
# for _ in pred.itertuples():
# data_point_value = []
# for key in md_keys:
# sim_tensor = sim_tensor_dict[key]
# data_point_value.append(round(float(sim_tensor[_[0]]), 4))
# if getattr(_, 'predicted') == str(1):
# pre_match_points.append(data_point_value)
# elif getattr(_, 'predicted') == str(0):
# pre_mismatch_points.append(data_point_value)
#
# md_data = np.array(md_data, dtype=np.float32)
# pre_match_points = np.array(pre_match_points, dtype=np.float32)
# pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
# labels = DBSCAN(md_data, 0.5, 30)
# output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
# plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)
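The clustering call above is DBSCAN(md_data, 0.5, 30) from draw_md_cluster. A minimal sketch, assuming that helper returns one integer label per row with noise marked -1, using scikit-learn's DBSCAN as a stand-in on hypothetical data:

import numpy as np
from sklearn.cluster import DBSCAN as SkDBSCAN

md_data = np.random.rand(1000, 3).astype(np.float32)  # hypothetical 3-D MD threshold vectors
labels = SkDBSCAN(eps=0.5, min_samples=30).fit_predict(md_data)
print('clusters found:', len(set(labels)) - (1 if -1 in labels else 0))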

@ -1,12 +1,14 @@
import json
import pickle
import time
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from colorama import Fore, init
from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade
from ml_er.magellan_er import matching
from settings import *
@ -61,9 +63,7 @@ class Optimization:
def train(self, config: Configuration, seed: int = 0) -> float:
cm.del_catalog()
with open(er_output_dir + "blocking_result.pickle", "rb") as file:
blocking_result = pickle.load(file)
indicators = matching(config, blocking_result)
indicators = matching(config)
return 1 - indicators['performance']
@ -73,20 +73,20 @@ def ml_er_hpo():
str_configspace = csj.write(cs)
dict_configspace = json.loads(str_configspace)
# save the hyperparameter space locally
with open(hpo_output_dir + "configspace.json", "w") as f:
with open(hpo_output_dir + r"\configspace.json", "w") as f:
json.dump(dict_configspace, f, indent=4)
scenario = Scenario(
cs,
crash_cost=1.0,
deterministic=True,
n_trials=20,
n_trials=16,
n_workers=1
)
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = HyperparameterOptimizationFacade(
smac = BlackBoxFacade(
scenario,
optimization.train,
initial_design=initial_design,
@ -97,19 +97,21 @@ def ml_er_hpo():
incumbent_cost = smac.validate(incumbent)
default = cs.get_default_configuration()
default_cost = smac.validate(default)
print(f"Default Cost: {default_cost}")
print(f"Incumbent Cost: {incumbent_cost}")
print(Fore.BLUE + f"Default Cost: {default_cost}")
print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")
if incumbent_cost > default_cost:
incumbent = default
print(f"Updated Incumbent Cost: {default_cost}")
print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')
print(f"Optimized Configuration:{incumbent.values()}")
print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")
with open(hpo_output_dir + "incumbent.json", "w") as f:
with open(hpo_output_dir + r"\incumbent.json", "w") as f:
json.dump(dict(incumbent), f, indent=4)
return incumbent
if __name__ == '__main__':
init(autoreset=True)
print(Fore.CYAN + f'Start Time: {time.time()}')
ml_er_hpo()
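For reference, a minimal sketch of how the BlackBoxFacade introduced above is wired up end to end, assuming the SMAC3 2.x API and a toy one-dimensional objective (the real run plugs in Optimization.train and the ER configuration space):

from ConfigSpace import Configuration, ConfigurationSpace, Float
from smac import BlackBoxFacade, Scenario

def train(config: Configuration, seed: int = 0) -> float:
    return (config["x"] - 0.3) ** 2  # toy cost; SMAC minimizes the returned value

cs = ConfigurationSpace()
cs.add_hyperparameters([Float("x", (0.0, 1.0), default=0.5)])
scenario = Scenario(cs, deterministic=True, n_trials=16, n_workers=1)
initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = BlackBoxFacade(scenario, train, initial_design=initial_design)
incumbent = smac.optimize()
print(f"Incumbent: {dict(incumbent)}, cost: {smac.validate(incumbent)}")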

@ -1,200 +0,0 @@
import operator
import numpy as np
import pandas as pd
import copy
import torch
from ConfigSpace import Configuration
from tqdm import tqdm
from settings import model, similarity_threshold, support_threshold, confidence_threshold
def is_minimal(md, md_list, target_col):
# assume this MD is minimal
if len(md_list) == 0:
return True
minimal = True
for _ in md_list:
if isinstance(_, tuple):
_ = _[0]
if _ != md:
other_cols = list(set(_.keys()) - {target_col})
# assume every MD in the list makes the current MD non-minimal
exist = True
# if any LHS threshold of the other MD is larger, the assumption fails
for col in other_cols:
if _[col] > md[col]:
exist = False
break
# if its RHS threshold is smaller, the assumption also fails
if _[target_col] < md[target_col]:
exist = False
# if the assumption holds for any MD, the current MD is not minimal
if exist:
minimal = False
break
return minimal
def pairs_inference(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
# sim_tensor = torch.round(sim_tensor, decimals=3)
# torch.save(sim_tensor, md_output_dir + "tensor.pt")
md_list = []
minimal_vio = []
init_md = {}
for col in columns:
init_md[col] = 1 if col == target_col else -1
md_list.append(init_md)
for row1 in tqdm(range(0, length - 1)):
terminate = False
for row2 in range(row1 + 1, length):
violated_mds = []
# sims holds the per-column similarities between the two rows
sims = {}
for col_index in range(0, width):
col = columns[col_index]
similarity = sim_tensor[col_index, row1, row2].item()
sims[col] = similarity
# find violated MDs: remove them from the MD list and add them to the violation list
for md in md_list[:]:
lhs_satis = True
rhs_satis = True
for col in cols_but_target:
if sims[col] < md[col]:
lhs_satis = False
break
if sims[target_col] < md[target_col]:
rhs_satis = False
if lhs_satis and not rhs_satis:
md_list.remove(md)
violated_mds.append(md)
# for vio_md in violated_mds:
# # specialize the LHS
# for col in cols_but_target:
# if sims[col] + 0.01 <= 1:
# spec_l_md = copy.deepcopy(vio_md)
# spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
# if is_minimal(spec_l_md, md_list, target_col):
# md_list.append(spec_l_md)
# if vio_md not in minimal_vio:
# minimal_vio.append(vio_md)
for vio_md in violated_mds:
vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
if vio_md_support >= support_threshold:
for col in cols_but_target:
if sims[col] < 1.0:
spec_l_md = copy.deepcopy(vio_md)
if sims[col] < similarity_threshold:
spec_l_md[col] = similarity_threshold
else:
if sims[col] + 0.01 <= 1.0:
spec_l_md[col] = sims[col] + 0.01
else:
spec_l_md[col] = 1.0
if is_minimal(spec_l_md, md_list, target_col):
md_list.append(spec_l_md)
if vio_md not in minimal_vio:
minimal_vio.append(vio_md)
if len(md_list) == 0:
terminate = True
break
if terminate:
break
if len(minimal_vio) > 0:
for md in minimal_vio[:]:
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
minimal_vio.append((md, support, confidence))
minimal_vio.remove(md)
if len(md_list) > 0:
# remove duplicate MDs
tmp = []
for _ in md_list:
if _ not in tmp:
tmp.append(_)
md_list = tmp
# drop MDs whose support falls below the threshold; qualifying MDs become (md, support, confidence) tuples
for _ in md_list[:]:
support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
md_list.append((_, support, confidence))
md_list.remove(_)
# remove non-minimal MDs
for md_tuple in md_list[:]:
if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
md_list.remove(md_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
result = []
result.extend(md_list)
result.extend(minimal_vio)
result.sort(key=operator.itemgetter(2), reverse=True)
print(f'\033[33mList Length: {len(result)}\033[0m')
return result
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
columns = data.columns.values.tolist()
length = data.shape[0]
width = data.shape[1]
md_tensor = list(current_md.values())
md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = ini_slice.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
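A small self-contained sketch of the support/confidence arithmetic in get_metrics above (CPU tensors, two columns, three tuples; all numbers are illustrative): support_Naumann counts unordered tuple pairs whose non-target similarities all reach the MD thresholds, support_Fan additionally requires the target similarity to reach its threshold, and confidence = support_Fan / support_Naumann. The diagonal self-pairs are removed and the symmetric count is halved, exactly as in the code.

import torch

length, target_index = 3, 1
sim_tensor = torch.tensor([
    [[1.0, 0.9, 0.2], [0.9, 1.0, 0.3], [0.2, 0.3, 1.0]],  # column 0 (LHS) similarities
    [[1.0, 0.8, 0.1], [0.8, 1.0, 0.1], [0.1, 0.1, 1.0]],  # column 1 (target) similarities
])
md_tensor = torch.tensor([0.85, 0.75])  # thresholds: column 0 >= 0.85, target >= 0.75
md_3d = md_tensor.unsqueeze(1).unsqueeze(2).repeat(1, length, length)
sup = torch.ge(sim_tensor, md_3d)

lhs = sup[0]  # only one non-target column here; the code ANDs all of them
support_naumann = (torch.count_nonzero(lhs.int()).item() - length) / 2
both = torch.logical_and(lhs, sup[target_index])
support_fan = (torch.count_nonzero(both.int()).item() - length) / 2
confidence = support_fan / support_naumann if support_naumann > 0 else 0
print(support_fan, confidence)  # only the pair (row 0, row 1) qualifies: 1.0 1.0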

@ -1,138 +0,0 @@
import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
sample_number = 100000
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
# md_tensor = list(current_md.values())
# md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = conf_tensor.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
def build_cartesian(width, target_index):
all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
num=math.ceil((1-similarity_threshold)/step_length) + 1)
all_values_array = np.round(all_values_array, 4)
all_values_tensor = torch.tensor(all_values_array, device='cuda')
all_values_tensor = all_values_tensor.float()
all_values_tensor = torch.round(all_values_tensor, decimals=4)
tensors_for_cartesian = []
for i in range(0, width):
if i == target_index:
t = torch.tensor([1.0], device='cuda')
tensors_for_cartesian.append(t)
else:
tensors_for_cartesian.append(all_values_tensor)
result = torch.cartesian_prod(*tensors_for_cartesian)
return result
def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
cartesian_product = cartesian_product.unsqueeze(2)
cartesian_product = cartesian_product.unsqueeze(3)
cartesian_product = cartesian_product.repeat(1, 1, length, length)
def discover(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
# indices of all columns except the target column
columns_indices = [_ for _ in range(0, width) if _ != target_index]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
sim_tensor = sim_tensor.float()
sim_tensor = torch.round(sim_tensor, decimals=4)
# with fewer than 6 columns a full Cartesian product is feasible; with more it may blow up exponentially
if width < 6:
# enumerate all values of every non-target column and take the Cartesian product, yielding every possible MD threshold vector
cartesian = build_cartesian(width, target_index)
# sample sample_number / (width - 1) MDs (containing no -1 entries)
if cartesian.shape[0] > sample_number / (width - 1):
index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
cartesian = torch.index_select(cartesian, 0, index)
else:
# randomly generate sample_number / (width - 1) MDs: draw integers with randint, then divide to get decimals (no -1 entries)
cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
(math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
cartesian = cartesian / 100
# generate a column of 1.0 similarities for the target column and insert it at the target position
ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
cartesian = torch.round(cartesian, decimals=4)
# this tensor will be concatenated with the tensors whose entries are partly set to -1
joint_md_tensor = cartesian.clone()
# randomly set 1, 2, ... columns to -1
for i in range(width - 2):
index_list_format = []
for j in range(cartesian.shape[0]):
# for each MD, randomly pick the column indices to set to -1
index_list_format.append(random.sample(columns_indices, i + 1))
index = torch.tensor(index_list_format, device='cuda')
# the MD set after randomly setting some entries to -1
modified_cartesian = cartesian.scatter(1, index, -1)
joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
md_list = []
# get_metric_tensor(cartesian, data, sim_tensor, target_index)
for _ in tqdm(range(joint_md_tensor.shape[0])):
s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
if s >= support_threshold and c >= confidence_threshold:
md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
md_dict_format = {}
for k in range(0, width):
md_dict_format[columns[k]] = md_list_format[k]
md_list.append((md_dict_format, s, c))
md_list.sort(key=operator.itemgetter(2), reverse=True)
return md_list
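A minimal sketch of how build_cartesian above enumerates candidate MD threshold vectors: each non-target column ranges over a grid from similarity_threshold to 1 in step_length increments, while the target column is pinned to 1.0 (the threshold, step and column count below are illustrative assumptions, and the tensors stay on CPU):

import math
import torch

similarity_threshold, step_length = 0.8, 0.05
width, target_index = 3, 2
grid = torch.linspace(similarity_threshold, 1.0,
                      steps=math.ceil((1 - similarity_threshold) / step_length) + 1)
grid = torch.round(grid, decimals=4)                       # [0.8, 0.85, 0.9, 0.95, 1.0]
axes = [torch.tensor([1.0]) if i == target_index else grid for i in range(width)]
candidates = torch.cartesian_prod(*axes)                   # every combination of LHS thresholds
print(candidates.shape)                                    # torch.Size([25, 3])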

@ -1,27 +0,0 @@
from ConfigSpace import Configuration
from md_discovery.discovery_executor import pairs_inference
from md_discovery.discovery_executor_gpu import discover
from settings import *
# # If support and confidence are not written out, use the two blocks below
# # write list 1 to a local path (adjust the path as needed)
# md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
# with open(md_path, 'w') as f:
# for _ in mds:
# f.write(str(_) + '\n')
#
# # write list 2 to a local path (adjust the path as needed)
# vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
# with open(vio_path, 'w') as f:
# for _ in vio:
# f.write(str(_) + '\n')
def md_discover(config: Configuration, source_path, target_path):
mds_list = discover(source_path, target_attr)
with open(target_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')
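For orientation, a tiny sketch of the line format md_discover writes and the clustering script at the top of this commit parses back ('Target:<attr>', a tab, then the (md_dict, support, confidence) tuple; the values below are made up):

line = "Target:id\t({'id': 1.0, 'name': 0.85, 'price': 0.7}, 12, 0.92)"
md_metadata = line.strip().split('\t')
md_tuple = eval(md_metadata[1])              # (md_dict, support, confidence)
assert md_tuple[0]['name'] == 0.85 and md_tuple[2] == 0.92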

@ -1,4 +1,5 @@
import itertools
import pickle
import random
import operator
from operator import itemgetter
@ -16,10 +17,12 @@ from settings import *
def mining(train: pd.DataFrame):
# data is train set, in which each row represents a tuple pair
train = train.astype(str)
# move the label column to the end
train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)
# instead of manually aligning the left/right table keys, only check whether the gold attribute is 1
# so drop the left/right table keys directly
data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False)
data = train.drop(columns=['_id', 'ltable_id', 'rtable_id'], inplace=False)
# data now holds the left/right table attributes (without keys) and gold, but not _id
columns = data.columns.values.tolist()
columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')]
@ -62,7 +65,7 @@ def mining(train: pd.DataFrame):
# build the labelled similarity tensor
sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1)
# find the row indices of matching tuple pairs
mask = (data['gold'].isin(['1']))
mask = (data['label'].isin(['1']))
match_pair_indices = data[mask].index.tolist()
# set the label of the matching rows to 1 by index
sim_table_tensor_labeled[match_pair_indices, -1] = 1.00
@ -209,7 +212,7 @@ def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list):
def mds_to_txt(result_list_):
p = md_output_dir + "mds.txt"
p = md_output_dir + r"\mds.txt"
with open(p, 'w') as f:
for _ in result_list_:
f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
@ -253,3 +256,10 @@ def merge_mds(md_list_):
del same_sc_list[index]
# flatten the nested list
return list(itertools.chain.from_iterable(grouped_md_tuples))
if __name__ == '__main__':
_train = pd.read_csv(directory_path + r'\train_whole.csv')
result = mining(_train)
with open(md_output_dir + r"\mds.pickle", "wb") as file_:
pickle.dump(result, file_)

@ -11,89 +11,20 @@ from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
from tqdm import tqdm
from colorama import Fore
from md_discovery.md_mining import mining
from settings import *
def blocking_mining():
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
matching_number = len(mappings)
# if ltable_id == rtable_id:
# tables_id = rtable_id
attributes = ltable.columns.values.tolist()
# lattributes = ['ltable_' + i for i in attributes]
# rattributes = ['rtable_' + i for i in attributes]
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
def matching(config: Configuration):
print(Fore.BLUE + f'Config: {config}')
with open(md_output_dir + r"\mds.pickle", "rb") as file:
md_list = pickle.load(file)
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
overlap_size=1, show_progress=False)
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
block_time = time.time()
print(f'Block Time: {block_time - start}')
# label the data according to the mapping table
candidate_match_rows = []
for t in tqdm(mappings.itertuples()):
mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
(candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
matching_indices = candidate[mask].index
candidate_match_rows.extend(matching_indices.tolist())
match_rows_mask = candidate.index.isin(candidate_match_rows)
candidate.loc[match_rows_mask, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# too many negative samples: sample three times as many as the positives
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# without resetting the index after concatenation, duplicate indices may appear
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
cm.set_ltable(candidate_for_train_test, ltable)
cm.set_rtable(candidate_for_train_test, rtable)
block_recall = len(candidate_match) / matching_number
# split into train and test sets
train_proportion = 0.5
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
label_and_split_time = time.time()
print(f'Label and Split Time: {label_and_split_time - block_time}')
# mine MDs and save them locally
md_list = mining(train_set)
mining_time = time.time()
print(f'Mining Time: {mining_time - label_and_split_time}')
blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall)
# save the blocking result locally
with open(er_output_dir + "blocking_result.pickle", "wb") as file_:
pickle.dump(blocking_results, file_)
return blocking_results
def matching(config: Configuration, blocking_result_):
print(f'\033[33mConfig: {config}\033[0m')
start = time.time()
ltable = blocking_result_[0]
rtable = blocking_result_[1]
train_set = blocking_result_[2]
test_set = blocking_result_[3]
md_list = blocking_result_[4]
block_recall = blocking_result_[5]
train_set = pd.read_csv(directory_path + r'\train_whole.csv', encoding='ISO-8859-1')
test_set = pd.read_csv(directory_path + r'\test_whole.csv', encoding='ISO-8859-1')
ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1')
rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1')
ml_matcher = config["ml_matcher"]
match ml_matcher:
case "dt":
@ -109,22 +40,22 @@ def matching(config: Configuration, blocking_result_):
max_features=config['rf_max_features'])
cm.set_key(train_set, '_id')
cm.set_fk_ltable(train_set, 'ltable_' + ltable_id)
cm.set_fk_rtable(train_set, 'rtable_' + rtable_id)
cm.set_fk_ltable(train_set, 'ltable_id')
cm.set_fk_rtable(train_set, 'rtable_id')
cm.set_ltable(train_set, ltable)
cm.set_rtable(train_set, rtable)
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
cm.set_key(ltable, 'id')
cm.set_key(rtable, 'id')
cm.set_key(test_set, '_id')
cm.set_fk_ltable(test_set, 'ltable_' + ltable_id)
cm.set_fk_rtable(test_set, 'rtable_' + rtable_id)
cm.set_fk_ltable(test_set, 'ltable_id')
cm.set_fk_rtable(test_set, 'rtable_id')
cm.set_ltable(test_set, ltable)
cm.set_rtable(test_set, rtable)
feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table,
attrs_after=['gold'],
attrs_after=['label'],
show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True)
@ -132,22 +63,21 @@ def matching(config: Configuration, blocking_result_):
for _ in test_feature_after[:]:
test_feature_after.append(_.replace('ltable_', 'rtable_'))
for _ in test_feature_after:
if _.endswith(ltable_id) or _.endswith(rtable_id):
if _.endswith('id'):
test_feature_after.remove(_)
test_feature_after.append('gold')
test_feature_after.append('label')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id])
fit_exclude = ['_id', 'ltable_id', 'rtable_id', 'label']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='label')
test_feature_after.extend(['_id', 'ltable_id', 'rtable_id'])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted')
indicators['block_recall'] = block_recall
indicators = evaluate_prediction(predictions, 'label', 'predicted')
test_feature_after.remove('_id')
test_feature_after.append('predicted')
@ -158,31 +88,26 @@ def matching(config: Configuration, blocking_result_):
# predictions now contains all left/right table attributes plus gold and predicted
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
predictions['md'] = ''
epl_match = 0 # explainable predicted matches
if len(md_list) > 0:
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
if str(getattr(row, 'predicted')) == str(1):
conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
if conf > 0:
predictions.loc[row[0], 'confidence'] = conf
predictions.loc[row[0], 'md'] = str(md_dict)
epl_match += 1
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability
# note: since blocking parameters are not tuned, assume block_recall is high and ignore it
# if indicators["block_recall"] < indicators["recall"]:
# f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
# indicators["precision"] + indicators["block_recall"])
# else:
# f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
indicators['performance'] = performance
print(f'ER Indicators: {indicators}')
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m')
print(Fore.BLUE + f'ER Indicators: {indicators}')
predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
print(Fore.CYAN + f'Finish Time: {time.time()}')
return indicators
@ -223,7 +148,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
# width = predictions.shape[1]
predictions = predictions.reset_index(drop=True)
sentences = predictions.values.flatten(order='F').tolist()
@ -238,7 +163,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor = torch.round(sim_tensor, decimals=2)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
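As a side note, the per-column similarity stored in sim_tensor_dict is plain cosine similarity: after L2-normalising the sentence embeddings, the row-wise dot product of an ltable_/rtable_ column pair is the cosine of the angle between the two cell embeddings. A minimal sketch (the model name and strings are assumptions; the project loads its own model in settings.py):

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
left = model.encode(['sony walkman', 'apple ipod'], convert_to_tensor=True)
right = model.encode(['sony walkman nw-a55', 'apple ipod touch'], convert_to_tensor=True)
left = torch.nn.functional.normalize(left, dim=1)
right = torch.nn.functional.normalize(right, dim=1)
sim = torch.round(torch.sum(left * right, 1), decimals=2)  # one cosine similarity per row pair
print(sim)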
@ -252,31 +177,28 @@ def is_explicable(row, all_mds: list, st_dict):
explicable = False # if any column's similarity misses its threshold, this MD cannot explain the current tuple pair
break # stop checking this MD's remaining thresholds and move on to the next MD
if explicable:
return md_tuple[2] # return as soon as one MD explains the pair
return -1.0 # no MD explains it after the full scan
return md_tuple[2], md_tuple[0] # return as soon as one MD explains the pair
return -1.0, {} # no MD explains it after the full scan
def ml_er(config: Configuration, blocking_result_):
indicators = matching(config, blocking_result_)
output_path = er_output_dir + "eval_result.txt"
def ml_er(config: Configuration):
indicators = matching(config)
output_path = er_output_dir + r"\eval_result.txt"
with open(output_path, 'w') as _f:
_f.write('Precision:' + str(indicators["precision"]) + '\n')
_f.write('Recall:' + str(indicators["recall"]) + '\n')
_f.write('F1:' + str(indicators["F1"]) + '\n')
_f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
_f.write('interpretability:' + str(indicators['interpretability']) + '\n')
_f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f:
if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
with open(hpo_output_dir + r"\configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f:
with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
with open(er_output_dir + "blocking_result.pickle", "rb") as file:
blocking_result = pickle.load(file)
ml_er(configuration, blocking_result)
ml_er(configuration)
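The scalar SMAC minimizes (1 - performance, returned by Optimization.train) combines interpretability and F1 as computed in matching() above; with interpre_weight = 0 in this commit's settings.py it reduces to plain F1. A toy restatement with illustrative numbers:

interpre_weight, interpretability, f1 = 0.0, 0.83, 0.91   # illustrative values only
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
cost = 1 - performance          # what Optimization.train hands back to SMAC
print(performance, cost)        # 0.91 0.09 (up to float rounding)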

@ -1,4 +0,0 @@
from ml_er.magellan_er import blocking_mining
if __name__ == '__main__':
blocking_mining()

@ -1,363 +0,0 @@
import json
import os
import sys
import time
import ConfigSpace
import pandas
import torch
from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import six
from ConfigSpace import Configuration
from tqdm import tqdm
from md_discovery.md_discover import md_discover
from settings import *
def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
df = train[train['gold'] == 1]
# make the left/right IDs of each tuple pair identical
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
train_columns = train.columns.values.tolist()
l_columns = []
r_columns = []
cols = []
# collect left-table and right-table column names into two separate lists
for _ in train_columns:
if _.startswith('ltable'):
l_columns.append(_)
elif _.startswith('rtable'):
r_columns.append(_)
# strip the 'ltable_' prefix to form a unified column-name list (assuming corresponding columns in the two tables share names)
for _ in l_columns:
cols.append(_.replace('ltable_', ''))
ldf = df[l_columns]
rdf = df[r_columns]
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
t_single_tuple = t_single_tuple.reset_index(drop=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
candidate: pd.DataFrame) -> dict:
new_df = df.reset_index(drop=False, inplace=False)
gold = new_df[labeled_attr]
predicted = new_df[predicted_attr]
gold_negative = gold[gold == 0].index.values
gold_positive = gold[gold == 1].index.values
predicted_negative = predicted[predicted == 0].index.values
predicted_positive = predicted[predicted == 1].index.values
false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
num_true_positives = float(len(true_positive_indices))
num_false_positives = float(len(false_positive_indices))
num_false_negatives = float(len(false_negative_indices))
precision_denominator = num_true_positives + num_false_positives
recall_denominator = num_true_positives + num_false_negatives
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
block_recall = len(candidate[candidate['gold'] == 1]) / matching_number
return {"precision": precision, "recall": recall, "F1": F1, "block_recall": block_recall}
def load_mds(paths: list) -> list:
if len(paths) == 0:
return []
all_mds = []
# a list of MD file paths is passed in
for md_path in paths:
if not os.path.exists(md_path):
continue
mds = []
# open each MD file
with open(md_path, 'r') as f:
# read the MD on each line and append it to this file's MD list
for line in f.readlines():
md_metadata = line.strip().split('\t')
# todo: if the MD file format changes, update this as well
md = eval(md_metadata[1])
mds.append(md)
all_mds.extend(mds)
return all_mds
def is_explicable(row, all_mds: list, st_dict):
attrs = all_mds[0][0].keys() # read all column names from the MD dict of the first md_tuple
for md_tuple in all_mds:
explicable = True # assume this MD explains the current tuple pair
for a in attrs:
if a != target_attr:
if st_dict[a][row[0]].item() < md_tuple[0][a]:
explicable = False # if any column's similarity misses its threshold, this MD cannot explain the current tuple pair
break # stop checking this MD's remaining thresholds and move on to the next MD
if explicable:
return md_tuple[2] # return as soon as one MD explains the pair
return -1.0 # no MD explains it after the full scan
# build a dict: key = column name, value = a 1-D tensor of the left/right similarity of that column for every row of the prediction table
def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = predictions.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# normalized embedding tensor of the predictions
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
def er_process(config: Configuration):
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
# ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
# rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
# keep only the rows of both tables that appear in the mapping table, to raise the positive-sample ratio
lid_mapping_list = []
rid_mapping_list = []
# convert everything to strings
# ltable = ltable.astype(str)
# rtable = rtable.astype(str)
# mappings = mappings.astype(str)
matching_number = len(mappings) # total number of positive samples
for index, row in mappings.iterrows():
lid_mapping_list.append(row[mapping_lid])
rid_mapping_list.append(row[mapping_rid])
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
tables_id = rtable_id
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist() # column names shared by the two tables
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
cm.set_key(selected_ltable, tables_id)
cm.set_key(selected_rtable, tables_id)
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
config["block_attr"], allow_missing=True,
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
overlap_size=1, show_progress=False)
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
candidate_match_rows = []
for row in candidate.itertuples():
l_id = getattr(row, 'ltable_' + tables_id)
map_row = mappings[mappings[mapping_lid] == l_id]
if map_row is not None:
r_id = map_row[mapping_rid]
for value in r_id:
if value == getattr(row, 'rtable_' + tables_id):
candidate_match_rows.append(row[0])
else:
continue
for _ in candidate_match_rows:
candidate.loc[_, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# downsample negatives to keep positive and negative counts equal
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
if len(candidate_mismatch) > len(candidate_match):
candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
# concatenate positive and negative samples
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# if len(candidate_for_train_test) == 0:
# return 0
# without resetting the index after concatenation, duplicate indices may appear
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
cm.set_ltable(candidate_for_train_test, selected_ltable)
cm.set_rtable(candidate_for_train_test, selected_rtable)
block_recall = len(candidate_match) / matching_number
# split into train and test sets
train_proportion = 0.7
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
# cm.set_key(train_set, '_id')
# cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
# cm.set_ltable(train_set, selected_ltable)
# cm.set_rtable(train_set, selected_rtable)
#
# cm.set_key(test_set, '_id')
# cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
# cm.set_ltable(test_set, selected_ltable)
# cm.set_rtable(test_set, selected_rtable)
ml_matcher = config["ml_matcher"]
if ml_matcher == "dt":
matcher = em.DTMatcher(name='DecisionTree', random_state=0)
elif ml_matcher == "svm":
matcher = em.SVMMatcher(name='SVM', random_state=0)
elif ml_matcher == "rf":
matcher = em.RFMatcher(name='RF', random_state=0)
elif ml_matcher == "lg":
matcher = em.LogRegMatcher(name='LogReg', random_state=0)
elif ml_matcher == "ln":
matcher = em.LinRegMatcher(name='LinReg')
elif ml_matcher == "nb":
matcher = em.NBMatcher(name='NaiveBayes')
feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table,
attrs_after=['gold'],
show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True)
test_feature_after = attrs_with_l_prefix[:]
test_feature_after.extend(attrs_with_r_prefix)
for _ in test_feature_after:
if _.endswith(tables_id):
test_feature_after.remove(_)
test_feature_after.append('gold')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
# compute interpretability
################################################################################################################
predictions_attrs = []
predictions_attrs.extend(attrs_with_l_prefix)
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
# MDs must be mined from the training set
train_attrs = predictions_attrs[:]
train_attrs.remove('predicted')
train_set = train_set[train_attrs]
prepare_file_for_md_discovery(train_set)
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
md_paths = [md_output_dir + 'mds.txt']
md_list = load_mds(md_paths) # load all MDs from the MD files
epl_match = 0 # explainable predicted matches
unexplainable = pd.DataFrame()
if len(md_list) > 0:
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
epl_match += 1
# else:
# series = pd.Series(row)
# unexplainable = unexplainable._append(series, ignore_index=True)
# unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
# unexplainable.columns = predictions_attrs
# unexplainable = unexplainable[train_attrs]
# if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
# prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
# md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability
if indicators["block_recall"] < indicators["recall"]:
f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
indicators["precision"] + indicators["block_recall"])
else:
f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
indicators['performance'] = performance
indicators['eval_result'] = eval_result
print(indicators)
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
################################################################################################################
print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
return indicators
def ml_er(config: Configuration = None):
indicators = er_process(config)
output_path = er_output_dir + "eval_result.txt"
with open(output_path, 'w') as f:
for key, value in six.iteritems(_get_metric(indicators['eval_result'])):
f.write(key + " : " + value)
f.write('\n')
f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
f.write('interpretability:' + str(indicators['interpretability']) + '\n')
f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
ml_er(configuration)

@ -1,24 +1,12 @@
from sentence_transformers import SentenceTransformer
ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
mapping_lid = 'idAbt' # left-table id column name in the mapping table
mapping_rid = 'idBuy' # right-table id column name in the mapping table
ltable_block_attr = 'name'
rtable_block_attr = 'name'
ltable_id = 'id' # left-table id column name
rtable_id = 'id' # right-table id column name
target_attr = 'id' # target column for MD mining
# lr_attrs_map = {} # if corresponding columns in the two tables have different names, add them here so they can be aligned
directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon'
model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\hpo\output'
model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2')
interpre_weight = 0 # interpretability weight
similarity_threshold = 0.1
support_threshold = 1
confidence_threshold = 0.75
er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
