Complete the Magellan experiments

MD-metrics-HPO
HuangJintao 6 months ago
parent f1e732afaf
commit fe5c1288ed

@@ -1,73 +1,73 @@
# Cluster the data points together with the MDs # # Cluster the data points together with the MDs
import os # import os
import numpy as np # import numpy as np
import pandas as pd # import pandas as pd
from matplotlib import pyplot as plt # from matplotlib import pyplot as plt
#
from draw_md_cluster import DBSCAN # from draw_md_cluster import DBSCAN
from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict # from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict
#
#
def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_): # def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_):
clusterNum = len(set(labels_)) # clusterNum = len(set(labels_))
fig = plt.figure() # fig = plt.figure()
scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'] # scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
ax = fig.add_subplot(111, projection='3d') # ax = fig.add_subplot(111, projection='3d')
for i in range(-1, clusterNum): # for i in range(-1, clusterNum):
colorStyle = scatterColors[i % len(scatterColors)] # colorStyle = scatterColors[i % len(scatterColors)]
subCluster = md_data_[np.where(labels_ == i)] # subCluster = md_data_[np.where(labels_ == i)]
ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12) # ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12)
ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x') # ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x')
if pre_mismatch_points_.shape[0] > 0: # if pre_mismatch_points_.shape[0] > 0:
ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x') # ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x')
ax.set_xlabel(md_keys_[0], rotation=0) # set the label angle # ax.set_xlabel(md_keys_[0], rotation=0) # set the label angle
ax.set_ylabel(md_keys_[1], rotation=-45) # ax.set_ylabel(md_keys_[1], rotation=-45)
ax.set_zlabel(md_keys_[2], rotation=0) # ax.set_zlabel(md_keys_[2], rotation=0)
plt.title(output_path_.split('\\')[-1].split('.')[0]) # plt.title(output_path_.split('\\')[-1].split('.')[0])
plt.savefig(output_path_, dpi=500) # plt.savefig(output_path_, dpi=500)
plt.show() # plt.show()
#
#
if __name__ == '__main__': # if __name__ == '__main__':
outcome_path = r'E:\Data\Research\Outcome' # outcome_path = r'E:\Data\Research\Outcome'
config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5' # config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()] # dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()]
for dataset_name in dataset_name_list: # for dataset_name in dataset_name_list:
absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # MD path # absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # MD path
predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # predictions path # predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # predictions path
pred = pd.read_csv(predictions) # pred = pd.read_csv(predictions)
pred = pred.astype(str) # pred = pred.astype(str)
# pred = pred[pred['predicted'] == str(1)] # # pred = pred[pred['predicted'] == str(1)]
sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred) # sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred)
# the three selected attributes # # the three selected attributes
md_keys = [] # md_keys = []
with open(absolute_path, 'r') as f: # with open(absolute_path, 'r') as f:
# read the MD on each line and append it to this file's MD list # # read the MD on each line and append it to this file's MD list
md_data = [] # md_data = []
for line in f.readlines(): # for line in f.readlines():
md_metadata = line.strip().split('\t') # md_metadata = line.strip().split('\t')
md_tuple = eval(md_metadata[1]) # md_tuple = eval(md_metadata[1])
md_keys = list(md_tuple[0].keys())[1:4] # md_keys = list(md_tuple[0].keys())[1:4]
md_values = list(md_tuple[0].values()) # md_values = list(md_tuple[0].values())
md_data.append(md_values[1:4]) # md_data.append(md_values[1:4])
if len(md_data) == 10000: # if len(md_data) == 10000:
break # break
#
pre_match_points = [] # pre_match_points = []
pre_mismatch_points = [] # pre_mismatch_points = []
for _ in pred.itertuples(): # for _ in pred.itertuples():
data_point_value = [] # data_point_value = []
for key in md_keys: # for key in md_keys:
sim_tensor = sim_tensor_dict[key] # sim_tensor = sim_tensor_dict[key]
data_point_value.append(round(float(sim_tensor[_[0]]), 4)) # data_point_value.append(round(float(sim_tensor[_[0]]), 4))
if getattr(_, 'predicted') == str(1): # if getattr(_, 'predicted') == str(1):
pre_match_points.append(data_point_value) # pre_match_points.append(data_point_value)
elif getattr(_, 'predicted') == str(0): # elif getattr(_, 'predicted') == str(0):
pre_mismatch_points.append(data_point_value) # pre_mismatch_points.append(data_point_value)
#
md_data = np.array(md_data, dtype=np.float32) # md_data = np.array(md_data, dtype=np.float32)
pre_match_points = np.array(pre_match_points, dtype=np.float32) # pre_match_points = np.array(pre_match_points, dtype=np.float32)
pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32) # pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32)
labels = DBSCAN(md_data, 0.5, 30) # labels = DBSCAN(md_data, 0.5, 30)
output_path = outcome_path + rf'\{dataset_name}_MD&data.png' # output_path = outcome_path + rf'\{dataset_name}_MD&data.png'
plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path) # plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path)

@@ -1,12 +1,14 @@
import json import json
import pickle import pickle
import time
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario from colorama import Fore, init
from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade
from ml_er.magellan_er import matching from ml_er.magellan_er import matching
from settings import * from settings import *
@@ -61,9 +63,7 @@ class Optimization:
def train(self, config: Configuration, seed: int = 0) -> float: def train(self, config: Configuration, seed: int = 0) -> float:
cm.del_catalog() cm.del_catalog()
with open(er_output_dir + "blocking_result.pickle", "rb") as file: indicators = matching(config)
blocking_result = pickle.load(file)
indicators = matching(config, blocking_result)
return 1 - indicators['performance'] return 1 - indicators['performance']
@@ -73,20 +73,20 @@ def ml_er_hpo():
str_configspace = csj.write(cs) str_configspace = csj.write(cs)
dict_configspace = json.loads(str_configspace) dict_configspace = json.loads(str_configspace)
# Save the hyperparameter space locally # Save the hyperparameter space locally
with open(hpo_output_dir + "configspace.json", "w") as f: with open(hpo_output_dir + r"\configspace.json", "w") as f:
json.dump(dict_configspace, f, indent=4) json.dump(dict_configspace, f, indent=4)
scenario = Scenario( scenario = Scenario(
cs, cs,
crash_cost=1.0, crash_cost=1.0,
deterministic=True, deterministic=True,
n_trials=20, n_trials=16,
n_workers=1 n_workers=1
) )
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = HyperparameterOptimizationFacade( smac = BlackBoxFacade(
scenario, scenario,
optimization.train, optimization.train,
initial_design=initial_design, initial_design=initial_design,
@@ -97,19 +97,21 @@ def ml_er_hpo():
incumbent_cost = smac.validate(incumbent) incumbent_cost = smac.validate(incumbent)
default = cs.get_default_configuration() default = cs.get_default_configuration()
default_cost = smac.validate(default) default_cost = smac.validate(default)
print(f"Default Cost: {default_cost}") print(Fore.BLUE + f"Default Cost: {default_cost}")
print(f"Incumbent Cost: {incumbent_cost}") print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")
if incumbent_cost > default_cost: if incumbent_cost > default_cost:
incumbent = default incumbent = default
print(f"Updated Incumbent Cost: {default_cost}") print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')
print(f"Optimized Configuration:{incumbent.values()}") print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")
with open(hpo_output_dir + "incumbent.json", "w") as f: with open(hpo_output_dir + r"\incumbent.json", "w") as f:
json.dump(dict(incumbent), f, indent=4) json.dump(dict(incumbent), f, indent=4)
return incumbent return incumbent
if __name__ == '__main__': if __name__ == '__main__':
init(autoreset=True)
print(Fore.CYAN + f'Start Time: {time.time()}')
ml_er_hpo() ml_er_hpo()
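
Note on the facade switch above: the HPO now drives SMAC's BlackBoxFacade instead of HyperparameterOptimizationFacade and calls matching(config) directly, with no pickled blocking result. Below is a minimal, self-contained sketch of that loop; the quadratic objective and the hyperparameter "x" are illustrative placeholders, not part of this project.

from ConfigSpace import Configuration, ConfigurationSpace
from smac import BlackBoxFacade, Scenario

def train(config: Configuration, seed: int = 0) -> float:
    # SMAC minimizes the returned cost; the project returns 1 - indicators['performance'].
    x = config["x"]
    return (x - 2.0) ** 2

cs = ConfigurationSpace({"x": (-5.0, 5.0)})  # toy search space
scenario = Scenario(cs, deterministic=True, n_trials=16, n_workers=1)
initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = BlackBoxFacade(scenario, train, initial_design=initial_design)
incumbent = smac.optimize()                  # best configuration found
print(smac.validate(incumbent))              # cost of the incumbent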

@@ -1,200 +0,0 @@
import operator
import numpy as np
import pandas as pd
import copy
import torch
from ConfigSpace import Configuration
from tqdm import tqdm
from settings import model, similarity_threshold, support_threshold, confidence_threshold
def is_minimal(md, md_list, target_col):
# Assume this MD is minimal
if len(md_list) == 0:
return True
minimal = True
for _ in md_list:
if isinstance(_, tuple):
_ = _[0]
if _ != md:
other_cols = list(set(_.keys()) - {target_col})
# Assume every MD in the list makes the current MD non-minimal
exist = True
# If any LHS threshold of the other MD is larger, the assumption fails
for col in other_cols:
if _[col] > md[col]:
exist = False
break
# If its RHS threshold is smaller, the assumption also fails
if _[target_col] < md[target_col]:
exist = False
# If the assumption holds for any MD, the current MD is not minimal
if exist:
minimal = False
break
return minimal
def pairs_inference(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
# sim_tensor = torch.round(sim_tensor, decimals=3)
# torch.save(sim_tensor, md_output_dir + "tensor.pt")
md_list = []
minimal_vio = []
init_md = {}
for col in columns:
init_md[col] = 1 if col == target_col else -1
md_list.append(init_md)
for row1 in tqdm(range(0, length - 1)):
terminate = False
for row2 in range(row1 + 1, length):
violated_mds = []
# sims holds the column-wise similarities between the two rows
sims = {}
for col_index in range(0, width):
col = columns[col_index]
similarity = sim_tensor[col_index, row1, row2].item()
sims[col] = similarity
# Look for violated MDs: remove them from md_list and add them to the violation list
for md in md_list[:]:
lhs_satis = True
rhs_satis = True
for col in cols_but_target:
if sims[col] < md[col]:
lhs_satis = False
break
if sims[target_col] < md[target_col]:
rhs_satis = False
if lhs_satis == True and rhs_satis == False:
md_list.remove(md)
violated_mds.append(md)
# for vio_md in violated_mds:
# # specialize the LHS
# for col in cols_but_target:
# if sims[col] + 0.01 <= 1:
# spec_l_md = copy.deepcopy(vio_md)
# spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
# if is_minimal(spec_l_md, md_list, target_col):
# md_list.append(spec_l_md)
# if vio_md not in minimal_vio:
# minimal_vio.append(vio_md)
for vio_md in violated_mds:
vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
if vio_md_support >= support_threshold:
for col in cols_but_target:
if sims[col] < 1.0:
spec_l_md = copy.deepcopy(vio_md)
if sims[col] < similarity_threshold:
spec_l_md[col] = similarity_threshold
else:
if sims[col] + 0.01 <= 1.0:
spec_l_md[col] = sims[col] + 0.01
else:
spec_l_md[col] = 1.0
if is_minimal(spec_l_md, md_list, target_col):
md_list.append(spec_l_md)
if vio_md not in minimal_vio:
minimal_vio.append(vio_md)
if len(md_list) == 0:
terminate = True
break
if terminate:
break
if len(minimal_vio) > 0:
for md in minimal_vio[:]:
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
minimal_vio.append((md, support, confidence))
minimal_vio.remove(md)
if len(md_list) > 0:
# Remove duplicate MDs
tmp = []
for _ in md_list:
if _ not in tmp:
tmp.append(_)
md_list = tmp
# Remove MDs whose support is below the threshold
for _ in md_list[:]:
support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
if support >= support_threshold and confidence >= confidence_threshold:
md_list.append((_, support, confidence))
md_list.remove(_)
# Remove MDs that are not minimal
for md_tuple in md_list[:]:
if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
md_list.remove(md_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
if len(minimal_vio) > 0:
for vio_tuple in minimal_vio[:]:
if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
minimal_vio.remove(vio_tuple)
result = []
result.extend(md_list)
result.extend(minimal_vio)
result.sort(key=operator.itemgetter(2), reverse=True)
print(f'\033[33mList Length: {len(result)}\033[0m')
return result
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
columns = data.columns.values.tolist()
length = data.shape[0]
width = data.shape[1]
md_tensor = list(current_md.values())
md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = ini_slice.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
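
For readers of the removed get_metrics above: it counts the absolute support and confidence of one MD over a (columns x rows x rows) pairwise similarity tensor. The following small NumPy sketch reproduces the same counting logic on synthetic data; the toy tensor and thresholds are made up purely for illustration.

import numpy as np

rng = np.random.default_rng(0)
cols, rows = 3, 4
sim = rng.random((cols, rows, rows))
sim = (sim + sim.transpose(0, 2, 1)) / 2          # symmetric pairwise similarities
for c in range(cols):
    np.fill_diagonal(sim[c], 1.0)                 # a row is fully similar to itself

md = np.array([0.7, 1.0, 0.6])                    # per-column thresholds, target column = 1
target_index = 1

ge = sim >= md[:, None, None]                     # pairs meeting each column's threshold
lhs = np.ones((rows, rows), dtype=bool)
for c in range(cols):
    if c != target_index:
        lhs &= ge[c]                              # pairs satisfying the whole LHS
support_naumann = (lhs.sum() - rows) / 2          # unordered pairs, diagonal removed
rhs_too = lhs & ge[target_index]                  # pairs also satisfying the RHS
support_fan = (rhs_too.sum() - rows) / 2
confidence = support_fan / support_naumann if support_naumann > 0 else 0
print(support_fan, confidence)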

@@ -1,138 +0,0 @@
import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
sample_number = 100000
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
# md_tensor = list(current_md.values())
# md_tensor = torch.tensor(md_tensor, device='cuda')
md_tensor_2d = md_tensor.unsqueeze(1)
md_tensor_3d = md_tensor_2d.unsqueeze(2)
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
sim_tensor = torch.round(sim_tensor, decimals=4)
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
for i in range(0, width):
if i != target_index:
sup_tensor_slice = sup_tensor[i]
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
sup_tensor_int = ini_slice.int()
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
support_Naumann = (support_Naumann - length) / 2
conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
conf_tensor_int = conf_tensor.int()
support_Fan = torch.count_nonzero(conf_tensor_int).item()
support_Fan = (support_Fan - length) / 2
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
return support_Fan, confidence
def build_cartesian(width, target_index):
all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
num=math.ceil((1-similarity_threshold)/step_length) + 1)
all_values_array = np.round(all_values_array, 4)
all_values_tensor = torch.tensor(all_values_array, device='cuda')
all_values_tensor = all_values_tensor.float()
all_values_tensor = torch.round(all_values_tensor, decimals=4)
tensors_for_cartesian = []
for i in range(0, width):
if i == target_index:
t = torch.tensor([1.0], device='cuda')
tensors_for_cartesian.append(t)
else:
tensors_for_cartesian.append(all_values_tensor)
result = torch.cartesian_prod(*tensors_for_cartesian)
return result
def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
length = data.shape[0]
width = data.shape[1]
cartesian_product = cartesian_product.unsqueeze(2)
cartesian_product = cartesian_product.unsqueeze(3)
cartesian_product = cartesian_product.repeat(1, 1, length, length)
def discover(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
columns = data.columns.values.tolist()
target_index = columns.index(target_col)
cols_but_target = list(set(columns) - {target_col})
length = data.shape[0]
width = data.shape[1]
# Indices of all columns except the target column
columns_indices = [_ for _ in range(0, width) if _ != target_index]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = data.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
sim_tensor = sim_tensor.float()
sim_tensor = torch.round(sim_tensor, decimals=4)
# With fewer than 6 columns a full Cartesian product is feasible; with more it may blow up exponentially
if width < 6:
# Enumerate all threshold values of every non-target column and take their Cartesian product: all possible MD candidates
cartesian = build_cartesian(width, target_index)
# Sample sample_number / (width - 1) MDs (no -1 entries)
if cartesian.shape[0] > sample_number / (width - 1):
index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
cartesian = torch.index_select(cartesian, 0, index)
else:
# Randomly generate sample_number / (width - 1) MDs: draw integers with randint, then divide to get decimals (no -1 entries)
cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
(math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
cartesian = cartesian / 100
# Create a column of similarity 1 for the target attribute and insert it at the target position
ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
cartesian = torch.round(cartesian, decimals=4)
# This tensor will be concatenated with copies that have entries set to -1
joint_md_tensor = cartesian.clone()
# Randomly set 1, 2, ... columns to -1
for i in range(width - 2):
index_list_format = []
for j in range(cartesian.shape[0]):
# For each MD, randomly choose the column indices to set to -1
index_list_format.append(random.sample(columns_indices, i + 1))
index = torch.tensor(index_list_format, device='cuda')
# The MD set after randomly setting entries to -1
modified_cartesian = cartesian.scatter(1, index, -1)
joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
md_list = []
# get_metric_tensor(cartesian, data, sim_tensor, target_index)
for _ in tqdm(range(joint_md_tensor.shape[0])):
s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
if s >= support_threshold and c >= confidence_threshold:
md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
md_dict_format = {}
for k in range(0, width):
md_dict_format[columns[k]] = md_list_format[k]
md_list.append((md_dict_format, s, c))
md_list.sort(key=operator.itemgetter(2), reverse=True)
return md_list

@@ -1,27 +0,0 @@
from ConfigSpace import Configuration
from md_discovery.discovery_executor import pairs_inference
from md_discovery.discovery_executor_gpu import discover
from settings import *
# # If support and confidence are not written out, use the two code blocks below
# # Write list 1 to a local path (adjust the path yourself)
# md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt'
# with open(md_path, 'w') as f:
# for _ in mds:
# f.write(str(_) + '\n')
#
# # Write list 2 to a local path (adjust the path yourself)
# vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt'
# with open(vio_path, 'w') as f:
# for _ in vio:
# f.write(str(_) + '\n')
def md_discover(config: Configuration, source_path, target_path):
mds_list = discover(source_path, target_attr)
with open(target_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
f.write('\n')

@@ -1,4 +1,5 @@
import itertools import itertools
import pickle
import random import random
import operator import operator
from operator import itemgetter from operator import itemgetter
@@ -16,10 +17,12 @@ from settings import *
def mining(train: pd.DataFrame): def mining(train: pd.DataFrame):
# data is train set, in which each row represents a tuple pair # data is train set, in which each row represents a tuple pair
train = train.astype(str) train = train.astype(str)
# Move the label column to the end
train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)
# Instead of manually aligning the left/right table keys, only check whether the gold attribute is 1 # Instead of manually aligning the left/right table keys, only check whether the gold attribute is 1
# So the left/right table keys are dropped directly # So the left/right table keys are dropped directly
data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False) data = train.drop(columns=['_id', 'ltable_id', 'rtable_id'], inplace=False)
# Attributes now in data: the left/right table attributes (keys excluded) and gold, without _id # Attributes now in data: the left/right table attributes (keys excluded) and gold, without _id
columns = data.columns.values.tolist() columns = data.columns.values.tolist()
columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')] columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')]
@@ -62,7 +65,7 @@ def mining(train: pd.DataFrame):
# Build the labeled similarity tensor # Build the labeled similarity tensor
sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1) sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1)
# Find the row indices of matching tuple pairs # Find the row indices of matching tuple pairs
mask = (data['gold'].isin(['1'])) mask = (data['label'].isin(['1']))
match_pair_indices = data[mask].index.tolist() match_pair_indices = data[mask].index.tolist()
# Use the indices to set the label of matching rows to 1 # Use the indices to set the label of matching rows to 1
sim_table_tensor_labeled[match_pair_indices, -1] = 1.00 sim_table_tensor_labeled[match_pair_indices, -1] = 1.00
@@ -209,7 +212,7 @@ def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list):
def mds_to_txt(result_list_): def mds_to_txt(result_list_):
p = md_output_dir + "mds.txt" p = md_output_dir + r"\mds.txt"
with open(p, 'w') as f: with open(p, 'w') as f:
for _ in result_list_: for _ in result_list_:
f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}') f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
@@ -253,3 +256,10 @@ def merge_mds(md_list_):
del same_sc_list[index] del same_sc_list[index]
# Flatten the nested list into a single list # Flatten the nested list into a single list
return list(itertools.chain.from_iterable(grouped_md_tuples)) return list(itertools.chain.from_iterable(grouped_md_tuples))
if __name__ == '__main__':
_train = pd.read_csv(directory_path + r'\train_whole.csv')
result = mining(_train)
with open(md_output_dir + r"\mds.pickle", "wb") as file_:
pickle.dump(result, file_)
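
The new __main__ block above mines MDs from train_whole.csv and pickles the result that magellan_er.matching() later loads. Below is a quick, hypothetical inspection snippet for that pickle, assuming each entry is an (md_dict, absolute_support, confidence) tuple as produced by mining() and written by mds_to_txt().

import pickle
from settings import md_output_dir

with open(md_output_dir + r"\mds.pickle", "rb") as f:
    md_list = pickle.load(f)

print(f"{len(md_list)} MDs mined")
for md_dict, support, confidence in md_list[:5]:   # peek at the top-ranked MDs
    print(f"MD: {md_dict}\tAbsolute Support: {support}\tConfidence: {confidence}")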

@@ -11,89 +11,20 @@ from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm import py_entitymatching.catalog.catalog_manager as cm
from tqdm import tqdm from tqdm import tqdm
from colorama import Fore
from md_discovery.md_mining import mining
from settings import * from settings import *
def blocking_mining(): def matching(config: Configuration):
start = time.time() print(Fore.BLUE + f'Config: {config}')
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') with open(md_output_dir + r"\mds.pickle", "rb") as file:
cm.set_key(ltable, ltable_id) md_list = pickle.load(file)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
matching_number = len(mappings)
# if ltable_id == rtable_id:
# tables_id = rtable_id
attributes = ltable.columns.values.tolist()
# lattributes = ['ltable_' + i for i in attributes]
# rattributes = ['rtable_' + i for i in attributes]
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
blocker = em.OverlapBlocker() train_set = pd.read_csv(directory_path + r'\train_whole.csv', encoding='ISO-8859-1')
candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True, test_set = pd.read_csv(directory_path + r'\test_whole.csv', encoding='ISO-8859-1')
l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1, ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1')
overlap_size=1, show_progress=False) rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1')
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
block_time = time.time()
print(f'Block Time: {block_time - start}')
# Label the data according to the mapping table
candidate_match_rows = []
for t in tqdm(mappings.itertuples()):
mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
(candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
matching_indices = candidate[mask].index
candidate_match_rows.extend(matching_indices.tolist())
match_rows_mask = candidate.index.isin(candidate_match_rows)
candidate.loc[match_rows_mask, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# Too many negative samples; sample three times the number of positives
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# Reset the index after concatenation, otherwise indices may be duplicated
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
cm.set_ltable(candidate_for_train_test, ltable)
cm.set_rtable(candidate_for_train_test, rtable)
block_recall = len(candidate_match) / matching_number
# Split into train and test sets
train_proportion = 0.5
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
label_and_split_time = time.time()
print(f'Label and Split Time: {label_and_split_time - block_time}')
# Mine MDs and save them locally
md_list = mining(train_set)
mining_time = time.time()
print(f'Mining Time: {mining_time - label_and_split_time}')
blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall)
# Save the blocking results locally
with open(er_output_dir + "blocking_result.pickle", "wb") as file_:
pickle.dump(blocking_results, file_)
return blocking_results
def matching(config: Configuration, blocking_result_):
print(f'\033[33mConfig: {config}\033[0m')
start = time.time()
ltable = blocking_result_[0]
rtable = blocking_result_[1]
train_set = blocking_result_[2]
test_set = blocking_result_[3]
md_list = blocking_result_[4]
block_recall = blocking_result_[5]
ml_matcher = config["ml_matcher"] ml_matcher = config["ml_matcher"]
match ml_matcher: match ml_matcher:
case "dt": case "dt":
@@ -109,22 +40,22 @@ def matching(config: Configuration, blocking_result_):
max_features=config['rf_max_features']) max_features=config['rf_max_features'])
cm.set_key(train_set, '_id') cm.set_key(train_set, '_id')
cm.set_fk_ltable(train_set, 'ltable_' + ltable_id) cm.set_fk_ltable(train_set, 'ltable_id')
cm.set_fk_rtable(train_set, 'rtable_' + rtable_id) cm.set_fk_rtable(train_set, 'rtable_id')
cm.set_ltable(train_set, ltable) cm.set_ltable(train_set, ltable)
cm.set_rtable(train_set, rtable) cm.set_rtable(train_set, rtable)
cm.set_key(ltable, ltable_id) cm.set_key(ltable, 'id')
cm.set_key(rtable, rtable_id) cm.set_key(rtable, 'id')
cm.set_key(test_set, '_id') cm.set_key(test_set, '_id')
cm.set_fk_ltable(test_set, 'ltable_' + ltable_id) cm.set_fk_ltable(test_set, 'ltable_id')
cm.set_fk_rtable(test_set, 'rtable_' + rtable_id) cm.set_fk_rtable(test_set, 'rtable_id')
cm.set_ltable(test_set, ltable) cm.set_ltable(test_set, ltable)
cm.set_rtable(test_set, rtable) cm.set_rtable(test_set, rtable)
feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False) feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set, train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table, feature_table=feature_table,
attrs_after=['gold'], attrs_after=['label'],
show_progress=False) show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True) train_feature_vecs.fillna(value=0, inplace=True)
@@ -132,22 +63,21 @@ def matching(config: Configuration, blocking_result_):
for _ in test_feature_after[:]: for _ in test_feature_after[:]:
test_feature_after.append(_.replace('ltable_', 'rtable_')) test_feature_after.append(_.replace('ltable_', 'rtable_'))
for _ in test_feature_after: for _ in test_feature_after:
if _.endswith(ltable_id) or _.endswith(rtable_id): if _.endswith('id'):
test_feature_after.remove(_) test_feature_after.remove(_)
test_feature_after.append('gold') test_feature_after.append('label')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False) attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True) test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold'] fit_exclude = ['_id', 'ltable_id', 'rtable_id', 'label']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='label')
test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id]) test_feature_after.extend(['_id', 'ltable_id', 'rtable_id'])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False) append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted') eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result) em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted') indicators = evaluate_prediction(predictions, 'label', 'predicted')
indicators['block_recall'] = block_recall
test_feature_after.remove('_id') test_feature_after.remove('_id')
test_feature_after.append('predicted') test_feature_after.append('predicted')
@@ -158,31 +88,26 @@ def matching(config: Configuration, blocking_result_):
# predictions now contains all left/right table attributes plus gold and predicted # predictions now contains all left/right table attributes plus gold and predicted
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0 predictions['confidence'] = 0
predictions['md'] = ''
epl_match = 0 # explainable predicted matches epl_match = 0 # explainable predicted matches
if len(md_list) > 0: if len(md_list) > 0:
for row in tqdm(predictions.itertuples()): for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict) if str(getattr(row, 'predicted')) == str(1):
if x > 0 and str(getattr(row, 'predicted')) == str(1): conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
predictions.loc[row[0], 'confidence'] = x if conf > 0:
epl_match += 1 predictions.loc[row[0], 'confidence'] = conf
predictions.loc[row[0], 'md'] = str(md_dict)
epl_match += 1
df = predictions[predictions['predicted'] == str(1)] df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability indicators['interpretability'] = interpretability
# note: since blocking parameters are no longer tuned, assume block_recall is high and ignore it
# if indicators["block_recall"] < indicators["recall"]:
# f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
# indicators["precision"] + indicators["block_recall"])
# else:
# f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"] performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
indicators['performance'] = performance indicators['performance'] = performance
print(f'ER Indicators: {indicators}') print(Fore.BLUE + f'ER Indicators: {indicators}')
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True) predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m') print(Fore.CYAN + f'Finish Time: {time.time()}')
return indicators return indicators
@@ -223,7 +148,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
col_tuple_list.append((left_index, right_index)) col_tuple_list.append((left_index, right_index))
length = predictions.shape[0] length = predictions.shape[0]
width = predictions.shape[1] # width = predictions.shape[1]
predictions = predictions.reset_index(drop=True) predictions = predictions.reset_index(drop=True)
sentences = predictions.values.flatten(order='F').tolist() sentences = predictions.values.flatten(order='F').tolist()
@@ -238,7 +163,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
rattr_tensor = norm_table_tensor[col_tuple[1]] rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1) sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4) sim_tensor = torch.round(sim_tensor, decimals=2)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict return sim_tensor_dict
@@ -252,31 +177,28 @@ def is_explicable(row, all_mds: list, st_dict):
explicable = False # if any attribute's similarity misses the threshold, this MD cannot explain the current tuple pair explicable = False # if any attribute's similarity misses the threshold, this MD cannot explain the current tuple pair
break # stop checking this MD's other thresholds and move on to the next MD break # stop checking this MD's other thresholds and move on to the next MD
if explicable: if explicable:
return md_tuple[2] # return as soon as any MD explains the pair return md_tuple[2], md_tuple[0] # return as soon as any MD explains the pair
return -1.0 # traversal finished, not explainable return -1.0, {} # traversal finished, not explainable
def ml_er(config: Configuration, blocking_result_): def ml_er(config: Configuration):
indicators = matching(config, blocking_result_) indicators = matching(config)
output_path = er_output_dir + "eval_result.txt" output_path = er_output_dir + r"\eval_result.txt"
with open(output_path, 'w') as _f: with open(output_path, 'w') as _f:
_f.write('Precision:' + str(indicators["precision"]) + '\n') _f.write('Precision:' + str(indicators["precision"]) + '\n')
_f.write('Recall:' + str(indicators["recall"]) + '\n') _f.write('Recall:' + str(indicators["recall"]) + '\n')
_f.write('F1:' + str(indicators["F1"]) + '\n') _f.write('F1:' + str(indicators["F1"]) + '\n')
_f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
_f.write('interpretability:' + str(indicators['interpretability']) + '\n') _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
_f.write('performance:' + str(indicators['performance']) + '\n') _f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__': if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"): if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f: with open(hpo_output_dir + r"\configspace.json", 'r') as f:
dict_configspace = json.load(f) dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace) str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace) configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f: with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
dic = json.load(f) dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic) configuration = ConfigSpace.Configuration(configspace, values=dic)
with open(er_output_dir + "blocking_result.pickle", "rb") as file: ml_er(configuration)
blocking_result = pickle.load(file)
ml_er(configuration, blocking_result)
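
As the matching() changes above show, block_recall has dropped out of the objective: SMAC now minimizes 1 - performance, where performance blends interpretability (the share of predicted matches explainable by some MD) with F1 using interpre_weight from settings. A tiny worked example with made-up numbers:

interpre_weight = 0.5        # illustrative value; the committed settings.py uses 0
interpretability = 0.8       # epl_match / number of predicted matches
f1 = 0.9                     # matcher F1 on the test set
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
cost = 1 - performance       # value returned by Optimization.train to SMAC
print(performance, cost)     # roughly 0.85 and 0.15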

@@ -1,4 +0,0 @@
from ml_er.magellan_er import blocking_mining
if __name__ == '__main__':
blocking_mining()

@@ -1,363 +0,0 @@
import json
import os
import sys
import time
import ConfigSpace
import pandas
import torch
from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import six
from ConfigSpace import Configuration
from tqdm import tqdm
from md_discovery.md_discover import md_discover
from settings import *
def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
df = train[train['gold'] == 1]
# Make the left and right IDs of a tuple pair identical
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
train_columns = train.columns.values.tolist()
l_columns = []
r_columns = []
cols = []
# Collect left-table and right-table attribute names into two separate lists
for _ in train_columns:
if _.startswith('ltable'):
l_columns.append(_)
elif _.startswith('rtable'):
r_columns.append(_)
# Strip the prefix from the left-table attribute names to get a unified name list (assuming corresponding attributes in both tables share the same name)
for _ in l_columns:
cols.append(_.replace('ltable_', ''))
ldf = df[l_columns]
rdf = df[r_columns]
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
t_single_tuple = t_single_tuple.reset_index(drop=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
candidate: pd.DataFrame) -> dict:
new_df = df.reset_index(drop=False, inplace=False)
gold = new_df[labeled_attr]
predicted = new_df[predicted_attr]
gold_negative = gold[gold == 0].index.values
gold_positive = gold[gold == 1].index.values
predicted_negative = predicted[predicted == 0].index.values
predicted_positive = predicted[predicted == 1].index.values
false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
num_true_positives = float(len(true_positive_indices))
num_false_positives = float(len(false_positive_indices))
num_false_negatives = float(len(false_negative_indices))
precision_denominator = num_true_positives + num_false_positives
recall_denominator = num_true_positives + num_false_negatives
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
block_recall = len(candidate[candidate['gold'] == 1]) / matching_number
return {"precision": precision, "recall": recall, "F1": F1, "block_recall": block_recall}
def load_mds(paths: list) -> list:
if len(paths) == 0:
return []
all_mds = []
# A list of MD file paths is passed in
for md_path in paths:
if not os.path.exists(md_path):
continue
mds = []
# Open each MD file
with open(md_path, 'r') as f:
# Read the MD on each line and append it to this file's MD list
for line in f.readlines():
md_metadata = line.strip().split('\t')
# todo: if the MD file format changes, this must change too
md = eval(md_metadata[1])
mds.append(md)
all_mds.extend(mds)
return all_mds
def is_explicable(row, all_mds: list, st_dict):
attrs = all_mds[0][0].keys() # read all attribute names from the MD dict of the first md_tuple
for md_tuple in all_mds:
explicable = True # assume this MD can explain the current tuple pair
for a in attrs:
if a != target_attr:
if st_dict[a][row[0]].item() < md_tuple[0][a]:
explicable = False # if any attribute's similarity misses the threshold, this MD cannot explain the current tuple pair
break # stop checking this MD's other thresholds and move on to the next MD
if explicable:
return md_tuple[2] # return as soon as any MD explains the pair
return -1.0 # traversal finished, not explainable
# Build a dict: key = attribute name, value = 1-D tensor recording, for each row of the prediction table, the similarity between that attribute's left and right values
def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
sentences = []
for col in range(0, width):
for row in range(0, length):
cell_value = predictions.values[row, col]
sentences.append(cell_value)
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# Normalized embedding tensor of the prediction table
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
def er_process(config: Configuration):
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
# ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
# rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
# Keep only the rows of the two tables that appear in the mapping table, to raise the share of positive samples
lid_mapping_list = []
rid_mapping_list = []
# Convert everything to strings
# ltable = ltable.astype(str)
# rtable = rtable.astype(str)
# mappings = mappings.astype(str)
matching_number = len(mappings) # total number of positive samples
for index, row in mappings.iterrows():
lid_mapping_list.append(row[mapping_lid])
rid_mapping_list.append(row[mapping_rid])
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
tables_id = rtable_id
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist() # attribute names shared by the two tables
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
cm.set_key(selected_ltable, tables_id)
cm.set_key(selected_rtable, tables_id)
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
config["block_attr"], allow_missing=True,
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
overlap_size=1, show_progress=False)
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
candidate_match_rows = []
for row in candidate.itertuples():
l_id = getattr(row, 'ltable_' + tables_id)
map_row = mappings[mappings[mapping_lid] == l_id]
if map_row is not None:
r_id = map_row[mapping_rid]
for value in r_id:
if value == getattr(row, 'rtable_' + tables_id):
candidate_match_rows.append(row[0])
else:
continue
for _ in candidate_match_rows:
candidate.loc[_, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# Down-sample negatives so that positives and negatives are balanced
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
if len(candidate_mismatch) > len(candidate_match):
candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
# Concatenate the positive and negative samples
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# if len(candidate_for_train_test) == 0:
# return 0
# Reset the index after concatenation, otherwise indices may be duplicated
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
cm.set_ltable(candidate_for_train_test, selected_ltable)
cm.set_rtable(candidate_for_train_test, selected_rtable)
block_recall = len(candidate_match) / matching_number
# Split into train and test sets
train_proportion = 0.7
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
# cm.set_key(train_set, '_id')
# cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
# cm.set_ltable(train_set, selected_ltable)
# cm.set_rtable(train_set, selected_rtable)
#
# cm.set_key(test_set, '_id')
# cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
# cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
# cm.set_ltable(test_set, selected_ltable)
# cm.set_rtable(test_set, selected_rtable)
ml_matcher = config["ml_matcher"]
if ml_matcher == "dt":
matcher = em.DTMatcher(name='DecisionTree', random_state=0)
elif ml_matcher == "svm":
matcher = em.SVMMatcher(name='SVM', random_state=0)
elif ml_matcher == "rf":
matcher = em.RFMatcher(name='RF', random_state=0)
elif ml_matcher == "lg":
matcher = em.LogRegMatcher(name='LogReg', random_state=0)
elif ml_matcher == "ln":
matcher = em.LinRegMatcher(name='LinReg')
elif ml_matcher == "nb":
matcher = em.NBMatcher(name='NaiveBayes')
feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table,
attrs_after=['gold'],
show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True)
test_feature_after = attrs_with_l_prefix[:]
test_feature_after.extend(attrs_with_r_prefix)
for _ in test_feature_after:
if _.endswith(tables_id):
test_feature_after.remove(_)
test_feature_after.append('gold')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
# Compute interpretability
################################################################################################################
predictions_attrs = []
predictions_attrs.extend(attrs_with_l_prefix)
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
# MDs must be mined from the training set
train_attrs = predictions_attrs[:]
train_attrs.remove('predicted')
train_set = train_set[train_attrs]
prepare_file_for_md_discovery(train_set)
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
md_paths = [md_output_dir + 'mds.txt']
md_list = load_mds(md_paths) # 从全局变量中读取所有的md
epl_match = 0 # explainable predicted matches
unexplainable = pd.DataFrame()
if len(md_list) > 0:
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
epl_match += 1
# else:
# series = pd.Series(row)
# unexplainable = unexplainable._append(series, ignore_index=True)
# unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
# unexplainable.columns = predictions_attrs
# unexplainable = unexplainable[train_attrs]
# if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
# prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
# md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability
indicators['interpretability'] = interpretability
if indicators["block_recall"] < indicators["recall"]:
f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
indicators["precision"] + indicators["block_recall"])
else:
f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
indicators['performance'] = performance
indicators['eval_result'] = eval_result
print(indicators)
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
################################################################################################################
print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
return indicators
def ml_er(config: Configuration = None):
indicators = er_process(config)
output_path = er_output_dir + "eval_result.txt"
with open(output_path, 'w') as f:
for key, value in six.iteritems(_get_metric(indicators['eval_result'])):
f.write(key + " : " + value)
f.write('\n')
f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
f.write('interpretability:' + str(indicators['interpretability']) + '\n')
f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
ml_er(configuration)

@@ -1,24 +1,12 @@
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv' directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon'
rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
mapping_lid = 'idAbt' # left-table id column name in the mapping table
mapping_rid = 'idBuy' # right-table id column name in the mapping table
ltable_block_attr = 'name'
rtable_block_attr = 'name'
ltable_id = 'id' # left table id attribute name
rtable_id = 'id' # right table id attribute name
target_attr = 'id' # target attribute for MD mining
# lr_attrs_map = {} # if corresponding attributes are named differently in the two tables, add the names here so they can be aligned
model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2') er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\hpo\output'
model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2')
interpre_weight = 0 # interpretability weight interpre_weight = 0 # interpretability weight
similarity_threshold = 0.1
support_threshold = 1 support_threshold = 1
confidence_threshold = 0.75 confidence_threshold = 0.75
er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
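With blocking removed from the loop, the settings above point everything at a pre-split DeepMatcher-style dataset directory. The following is a hedged sketch of the run order this commit implies (mine once, tune, then evaluate); the hpo module path in the import is an assumption, not taken from the diff.

import pickle
import pandas as pd

from md_discovery.md_mining import mining
from ml_er.magellan_er import ml_er
from settings import directory_path, md_output_dir
# Assumed location of ml_er_hpo(); adjust to wherever the HPO script lives in the repo.
from hpo.magellan_hpo import ml_er_hpo

train = pd.read_csv(directory_path + r'\train_whole.csv')
with open(md_output_dir + r"\mds.pickle", "wb") as f:
    pickle.dump(mining(train), f)    # step 1: mine MDs once and cache them
incumbent = ml_er_hpo()              # step 2: SMAC search over matcher configurations
ml_er(incumbent)                     # step 3: final evaluation with the incumbent config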
