From fe5c1288edc8333e92807b3ab995a868e8918845 Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Wed, 12 Jun 2024 15:08:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90Magellan=E5=AE=9E=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draw/draw_md_cluster_with_data_point.py | 146 +++++----- hpo/magellan_hpo.py | 28 +- md_discovery/discovery_executor.py | 200 ------------- md_discovery/discovery_executor_gpu.py | 138 --------- md_discovery/md_discover.py | 27 -- md_discovery/md_mining.py | 16 +- ml_er/magellan_er.py | 166 +++-------- ml_er/magellan_start.py | 4 - ml_er/ml_entity_resolver.py | 363 ------------------------ settings.py | 24 +- 10 files changed, 151 insertions(+), 961 deletions(-) delete mode 100644 md_discovery/discovery_executor.py delete mode 100644 md_discovery/discovery_executor_gpu.py delete mode 100644 md_discovery/md_discover.py delete mode 100644 ml_er/magellan_start.py delete mode 100644 ml_er/ml_entity_resolver.py diff --git a/draw/draw_md_cluster_with_data_point.py b/draw/draw_md_cluster_with_data_point.py index f373c88..9b43f7a 100644 --- a/draw/draw_md_cluster_with_data_point.py +++ b/draw/draw_md_cluster_with_data_point.py @@ -1,73 +1,73 @@ -# 将数据点和MD一起聚类 -import os -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt - -from draw_md_cluster import DBSCAN -from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict - - -def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_): - clusterNum = len(set(labels_)) - fig = plt.figure() - scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'] - ax = fig.add_subplot(111, projection='3d') - for i in range(-1, clusterNum): - colorStyle = scatterColors[i % len(scatterColors)] - subCluster = md_data_[np.where(labels_ == i)] - ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12) - ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x') - if pre_mismatch_points_.shape[0] > 0: - ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x') - ax.set_xlabel(md_keys_[0], rotation=0) # 设置标签角度 - ax.set_ylabel(md_keys_[1], rotation=-45) - ax.set_zlabel(md_keys_[2], rotation=0) - plt.title(output_path_.split('\\')[-1].split('.')[0]) - plt.savefig(output_path_, dpi=500) - plt.show() - - -if __name__ == '__main__': - outcome_path = r'E:\Data\Research\Outcome' - config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5' - dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()] - for dataset_name in dataset_name_list: - absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # MD路径 - predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # prediction路径 - pred = pd.read_csv(predictions) - pred = pred.astype(str) - # pred = pred[pred['predicted'] == str(1)] - sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred) - # 选取的三个字段 - md_keys = [] - with open(absolute_path, 'r') as f: - # 读取每一行的md,加入该文件的md列表 - md_data = [] - for line in f.readlines(): - md_metadata = line.strip().split('\t') - md_tuple = eval(md_metadata[1]) - md_keys = list(md_tuple[0].keys())[1:4] - md_values = list(md_tuple[0].values()) - md_data.append(md_values[1:4]) - if len(md_data) == 10000: - break - - pre_match_points = [] - pre_mismatch_points = 
[] - for _ in pred.itertuples(): - data_point_value = [] - for key in md_keys: - sim_tensor = sim_tensor_dict[key] - data_point_value.append(round(float(sim_tensor[_[0]]), 4)) - if getattr(_, 'predicted') == str(1): - pre_match_points.append(data_point_value) - elif getattr(_, 'predicted') == str(0): - pre_mismatch_points.append(data_point_value) - - md_data = np.array(md_data, dtype=np.float32) - pre_match_points = np.array(pre_match_points, dtype=np.float32) - pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32) - labels = DBSCAN(md_data, 0.5, 30) - output_path = outcome_path + rf'\{dataset_name}_MD&data.png' - plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path) +# # 将数据点和MD一起聚类 +# import os +# import numpy as np +# import pandas as pd +# from matplotlib import pyplot as plt +# +# from draw_md_cluster import DBSCAN +# from ml_er.ml_entity_resolver import build_col_pairs_sim_tensor_dict +# +# +# def plot(md_keys_, md_data_, pre_match_points_, pre_mismatch_points_, labels_, output_path_): +# clusterNum = len(set(labels_)) +# fig = plt.figure() +# scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown'] +# ax = fig.add_subplot(111, projection='3d') +# for i in range(-1, clusterNum): +# colorStyle = scatterColors[i % len(scatterColors)] +# subCluster = md_data_[np.where(labels_ == i)] +# ax.scatter(subCluster[:, 0], subCluster[:, 1], subCluster[:, 2], c=colorStyle, s=12) +# ax.scatter(pre_match_points_[:, 0], pre_match_points_[:, 1], pre_match_points_[:, 2], c='#66CCFF', s=12, marker='x') +# if pre_mismatch_points_.shape[0] > 0: +# ax.scatter(pre_mismatch_points_[:, 0], pre_mismatch_points_[:, 1], pre_mismatch_points_[:, 2], c='#006666', s=12, marker='x') +# ax.set_xlabel(md_keys_[0], rotation=0) # 设置标签角度 +# ax.set_ylabel(md_keys_[1], rotation=-45) +# ax.set_zlabel(md_keys_[2], rotation=0) +# plt.title(output_path_.split('\\')[-1].split('.')[0]) +# plt.savefig(output_path_, dpi=500) +# plt.show() +# +# +# if __name__ == '__main__': +# outcome_path = r'E:\Data\Research\Outcome' +# config_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5' +# dataset_name_list = [f.name for f in os.scandir(outcome_path) if f.is_dir()] +# for dataset_name in dataset_name_list: +# absolute_path = outcome_path + rf'\{dataset_name}' + config_dir + r'\mds.txt' # MD路径 +# predictions = outcome_path + rf'\{dataset_name}' + config_dir + r'\predictions.csv' # prediction路径 +# pred = pd.read_csv(predictions) +# pred = pred.astype(str) +# # pred = pred[pred['predicted'] == str(1)] +# sim_tensor_dict = build_col_pairs_sim_tensor_dict(pred) +# # 选取的三个字段 +# md_keys = [] +# with open(absolute_path, 'r') as f: +# # 读取每一行的md,加入该文件的md列表 +# md_data = [] +# for line in f.readlines(): +# md_metadata = line.strip().split('\t') +# md_tuple = eval(md_metadata[1]) +# md_keys = list(md_tuple[0].keys())[1:4] +# md_values = list(md_tuple[0].values()) +# md_data.append(md_values[1:4]) +# if len(md_data) == 10000: +# break +# +# pre_match_points = [] +# pre_mismatch_points = [] +# for _ in pred.itertuples(): +# data_point_value = [] +# for key in md_keys: +# sim_tensor = sim_tensor_dict[key] +# data_point_value.append(round(float(sim_tensor[_[0]]), 4)) +# if getattr(_, 'predicted') == str(1): +# pre_match_points.append(data_point_value) +# elif getattr(_, 'predicted') == str(0): +# pre_mismatch_points.append(data_point_value) +# +# md_data = np.array(md_data, dtype=np.float32) +# pre_match_points = np.array(pre_match_points, dtype=np.float32) +# 
pre_mismatch_points = np.array(pre_mismatch_points, dtype=np.float32) +# labels = DBSCAN(md_data, 0.5, 30) +# output_path = outcome_path + rf'\{dataset_name}_MD&data.png' +# plot(md_keys, md_data, pre_match_points, pre_mismatch_points, labels, output_path) diff --git a/hpo/magellan_hpo.py b/hpo/magellan_hpo.py index a2625a1..3831752 100644 --- a/hpo/magellan_hpo.py +++ b/hpo/magellan_hpo.py @@ -1,12 +1,14 @@ import json import pickle +import time from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction from ConfigSpace.read_and_write import json as csj import py_entitymatching.catalog.catalog_manager as cm import pandas as pd -from smac import HyperparameterOptimizationFacade, Scenario +from colorama import Fore, init +from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade from ml_er.magellan_er import matching from settings import * @@ -61,9 +63,7 @@ class Optimization: def train(self, config: Configuration, seed: int = 0) -> float: cm.del_catalog() - with open(er_output_dir + "blocking_result.pickle", "rb") as file: - blocking_result = pickle.load(file) - indicators = matching(config, blocking_result) + indicators = matching(config) return 1 - indicators['performance'] @@ -73,20 +73,20 @@ def ml_er_hpo(): str_configspace = csj.write(cs) dict_configspace = json.loads(str_configspace) # 将超参数空间保存本地 - with open(hpo_output_dir + "configspace.json", "w") as f: + with open(hpo_output_dir + r"\configspace.json", "w") as f: json.dump(dict_configspace, f, indent=4) scenario = Scenario( cs, crash_cost=1.0, deterministic=True, - n_trials=20, + n_trials=16, n_workers=1 ) - initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) + initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5) - smac = HyperparameterOptimizationFacade( + smac = BlackBoxFacade( scenario, optimization.train, initial_design=initial_design, @@ -97,19 +97,21 @@ def ml_er_hpo(): incumbent_cost = smac.validate(incumbent) default = cs.get_default_configuration() default_cost = smac.validate(default) - print(f"Default Cost: {default_cost}") - print(f"Incumbent Cost: {incumbent_cost}") + print(Fore.BLUE + f"Default Cost: {default_cost}") + print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}") if incumbent_cost > default_cost: incumbent = default - print(f"Updated Incumbent Cost: {default_cost}") + print(Fore.RED + f'Updated Incumbent Cost: {default_cost}') - print(f"Optimized Configuration:{incumbent.values()}") + print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}") - with open(hpo_output_dir + "incumbent.json", "w") as f: + with open(hpo_output_dir + r"\incumbent.json", "w") as f: json.dump(dict(incumbent), f, indent=4) return incumbent if __name__ == '__main__': + init(autoreset=True) + print(Fore.CYAN + f'Start Time: {time.time()}') ml_er_hpo() diff --git a/md_discovery/discovery_executor.py b/md_discovery/discovery_executor.py deleted file mode 100644 index ba01a44..0000000 --- a/md_discovery/discovery_executor.py +++ /dev/null @@ -1,200 +0,0 @@ -import operator - -import numpy as np -import pandas as pd -import copy -import torch -from ConfigSpace import Configuration -from tqdm import tqdm -from settings import model, similarity_threshold, support_threshold, confidence_threshold - - -def is_minimal(md, md_list, target_col): - # 假设这个md是minimal - if len(md_list) == 0: - return True - minimal = True - for _ in md_list: - if 
isinstance(_, tuple): - _ = _[0] - if _ != md: - other_cols = list(set(_.keys()) - {target_col}) - # 假设列表中每一个md都使当前md不minimal - exist = True - # 如果左边任何一个大于,则假设不成立 - for col in other_cols: - if _[col] > md[col]: - exist = False - break - # 如果右边小于,假设也不成立 - if _[target_col] < md[target_col]: - exist = False - # 任何一次假设成立,当前md不minimal - if exist: - minimal = False - break - return minimal - - -def pairs_inference(path, target_col): - data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1') - data.fillna("", inplace=True) - data = data.astype(str) - columns = data.columns.values.tolist() - target_index = columns.index(target_col) - cols_but_target = list(set(columns) - {target_col}) - length = data.shape[0] - width = data.shape[1] - - sentences = [] - for col in range(0, width): - for row in range(0, length): - cell_value = data.values[row, col] - sentences.append(cell_value) - embedding = model.encode(sentences, convert_to_tensor=True, device="cuda") - split_embedding = torch.split(embedding, length, dim=0) - table_tensor = torch.stack(split_embedding, dim=0, out=None) - norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) - sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) - # sim_tensor = torch.round(sim_tensor, decimals=3) - - # torch.save(sim_tensor, md_output_dir + "tensor.pt") - - md_list = [] - minimal_vio = [] - init_md = {} - for col in columns: - init_md[col] = 1 if col == target_col else -1 - md_list.append(init_md) - - for row1 in tqdm(range(0, length - 1)): - terminate = False - for row2 in range(row1 + 1, length): - violated_mds = [] - # sims是两行的相似度 - sims = {} - for col_index in range(0, width): - col = columns[col_index] - similarity = sim_tensor[col_index, row1, row2].item() - sims[col] = similarity - - # 寻找violated md,从md列表中删除并加入vio列表 - for md in md_list[:]: - lhs_satis = True - rhs_satis = True - for col in cols_but_target: - if sims[col] < md[col]: - lhs_satis = False - break - if sims[target_col] < md[target_col]: - rhs_satis = False - if lhs_satis == True and rhs_satis == False: - md_list.remove(md) - violated_mds.append(md) - - # for vio_md in violated_mds: - # # 特殊化左侧 - # for col in cols_but_target: - # if sims[col] + 0.01 <= 1: - # spec_l_md = copy.deepcopy(vio_md) - # spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01 - # if is_minimal(spec_l_md, md_list, target_col): - # md_list.append(spec_l_md) - # if vio_md not in minimal_vio: - # minimal_vio.append(vio_md) - - for vio_md in violated_mds: - vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index) - if vio_md_support >= support_threshold: - for col in cols_but_target: - if sims[col] < 1.0: - spec_l_md = copy.deepcopy(vio_md) - if sims[col] < similarity_threshold: - spec_l_md[col] = similarity_threshold - else: - if sims[col] + 0.01 <= 1.0: - spec_l_md[col] = sims[col] + 0.01 - else: - spec_l_md[col] = 1.0 - if is_minimal(spec_l_md, md_list, target_col): - md_list.append(spec_l_md) - if vio_md not in minimal_vio: - minimal_vio.append(vio_md) - - if len(md_list) == 0: - terminate = True - break - if terminate: - break - - if len(minimal_vio) > 0: - for md in minimal_vio[:]: - support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index) - if support >= support_threshold and confidence >= confidence_threshold: - minimal_vio.append((md, support, confidence)) - minimal_vio.remove(md) - - if len(md_list) > 0: - # 去除重复MD - tmp = [] - for _ in md_list: - if _ not in tmp: - tmp.append(_) - 
md_list = tmp - # 去除support小于阈值MD - for _ in md_list[:]: - support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index) - if support >= support_threshold and confidence >= confidence_threshold: - md_list.append((_, support, confidence)) - md_list.remove(_) - # 去除不minimal的MD - for md_tuple in md_list[:]: - if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5: - md_list.remove(md_tuple) - if len(minimal_vio) > 0: - for vio_tuple in minimal_vio[:]: - if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5: - minimal_vio.remove(vio_tuple) - - if len(minimal_vio) > 0: - for vio_tuple in minimal_vio[:]: - if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5: - minimal_vio.remove(vio_tuple) - - result = [] - result.extend(md_list) - result.extend(minimal_vio) - result.sort(key=operator.itemgetter(2), reverse=True) - print(f'\033[33mList Length: {len(result)}\033[0m') - return result - - -def get_metrics(current_md, data, sim_tensor, target_col, target_index): - columns = data.columns.values.tolist() - length = data.shape[0] - width = data.shape[1] - - md_tensor = list(current_md.values()) - md_tensor = torch.tensor(md_tensor, device='cuda') - md_tensor_2d = md_tensor.unsqueeze(1) - md_tensor_3d = md_tensor_2d.unsqueeze(2) - md_tensor_3d = md_tensor_3d.repeat(1, length, length) - - sim_tensor = torch.round(sim_tensor, decimals=4) - sup_tensor = torch.ge(sim_tensor, md_tensor_3d) - ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda') - for i in range(0, width): - if i != target_index: - sup_tensor_slice = sup_tensor[i] - ini_slice = torch.logical_and(ini_slice, sup_tensor_slice) - sup_tensor_int = ini_slice.int() - support_Naumann = torch.count_nonzero(sup_tensor_int).item() - support_Naumann = (support_Naumann - length) / 2 - - ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index]) - conf_tensor_int = ini_slice.int() - support_Fan = torch.count_nonzero(conf_tensor_int).item() - support_Fan = (support_Fan - length) / 2 - confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0 - - return support_Fan, confidence diff --git a/md_discovery/discovery_executor_gpu.py b/md_discovery/discovery_executor_gpu.py deleted file mode 100644 index df8774f..0000000 --- a/md_discovery/discovery_executor_gpu.py +++ /dev/null @@ -1,138 +0,0 @@ -import math -import operator -import random -import time -from tqdm import tqdm -import numpy as np -import pandas as pd -import torch - -from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir - -sample_number = 100000 -step_length = 0.01 - -def get_metrics(md_tensor, data, sim_tensor, target_index): - length = data.shape[0] - width = data.shape[1] - - # md_tensor = list(current_md.values()) - # md_tensor = torch.tensor(md_tensor, device='cuda') - md_tensor_2d = md_tensor.unsqueeze(1) - md_tensor_3d = md_tensor_2d.unsqueeze(2) - md_tensor_3d = md_tensor_3d.repeat(1, length, length) - - sim_tensor = torch.round(sim_tensor, decimals=4) - - sup_tensor = torch.ge(sim_tensor, md_tensor_3d) - ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda') - for i in range(0, width): - if i != target_index: - sup_tensor_slice = sup_tensor[i] - ini_slice = torch.logical_and(ini_slice, sup_tensor_slice) - sup_tensor_int = ini_slice.int() - support_Naumann = torch.count_nonzero(sup_tensor_int).item() - support_Naumann = (support_Naumann - length) / 2 - - conf_tensor = 
torch.logical_and(ini_slice, sup_tensor[target_index]) - conf_tensor_int = conf_tensor.int() - support_Fan = torch.count_nonzero(conf_tensor_int).item() - support_Fan = (support_Fan - length) / 2 - confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0 - - return support_Fan, confidence - - -def build_cartesian(width, target_index): - all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True, - num=math.ceil((1-similarity_threshold)/step_length) + 1) - all_values_array = np.round(all_values_array, 4) - all_values_tensor = torch.tensor(all_values_array, device='cuda') - all_values_tensor = all_values_tensor.float() - all_values_tensor = torch.round(all_values_tensor, decimals=4) - tensors_for_cartesian = [] - for i in range(0, width): - if i == target_index: - t = torch.tensor([1.0], device='cuda') - tensors_for_cartesian.append(t) - else: - tensors_for_cartesian.append(all_values_tensor) - result = torch.cartesian_prod(*tensors_for_cartesian) - return result - - -def get_metric_tensor(cartesian_product, data, sim_tensor, target_index): - length = data.shape[0] - width = data.shape[1] - cartesian_product = cartesian_product.unsqueeze(2) - cartesian_product = cartesian_product.unsqueeze(3) - cartesian_product = cartesian_product.repeat(1, 1, length, length) - - -def discover(path, target_col): - data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1') - data.fillna("", inplace=True) - data = data.astype(str) - columns = data.columns.values.tolist() - target_index = columns.index(target_col) - cols_but_target = list(set(columns) - {target_col}) - length = data.shape[0] - width = data.shape[1] - # 除了目标列外所有列的索引 - columns_indices = [_ for _ in range(0, width) if _ != target_index] - - sentences = [] - for col in range(0, width): - for row in range(0, length): - cell_value = data.values[row, col] - sentences.append(cell_value) - embedding = model.encode(sentences, convert_to_tensor=True, device="cuda") - split_embedding = torch.split(embedding, length, dim=0) - table_tensor = torch.stack(split_embedding, dim=0, out=None) - norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) - sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) - sim_tensor = sim_tensor.float() - sim_tensor = torch.round(sim_tensor, decimals=4) - - # 小于6列的可以尝试做笛卡尔积,大于6列可能指数爆炸 - if width < 6: - # 列出除目标列以外所有列的所有取值,做笛卡尔积,结果为所有可能MD取值 - cartesian = build_cartesian(width, target_index) - # 抽取sample_number / (width - 1)条MD,不含-1 - if cartesian.shape[0] > sample_number / (width - 1): - index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda') - cartesian = torch.index_select(cartesian, 0, index) - else: - # 随机生成sample_number / (width - 1)条MD,使用randint先转化为int再除成小数,不含-1 - cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100, - (math.ceil(sample_number / (width - 1)), width - 1), device='cuda') - cartesian = cartesian / 100 - # 生成一列相似度为1的目标列,插入目标列所在位置 - ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda') - cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1) - cartesian = torch.round(cartesian, decimals=4) - # 此tensor将与其他置为-1的tensor拼接 - joint_md_tensor = cartesian.clone() - # 随机将1列,2列……置为-1 - for i in range(width - 2): - index_list_format = [] - for j in range(cartesian.shape[0]): - # 对每条MD,随机选择将要置为-1的列索引 - index_list_format.append(random.sample(columns_indices, i + 1)) - index = torch.tensor(index_list_format, 
device='cuda') - # 随机调整为-1后的MD集合 - modified_cartesian = cartesian.scatter(1, index, -1) - joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0) - - md_list = [] - # get_metric_tensor(cartesian, data, sim_tensor, target_index) - for _ in tqdm(range(joint_md_tensor.shape[0])): - s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index) - if s >= support_threshold and c >= confidence_threshold: - md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()] - md_dict_format = {} - for k in range(0, width): - md_dict_format[columns[k]] = md_list_format[k] - md_list.append((md_dict_format, s, c)) - md_list.sort(key=operator.itemgetter(2), reverse=True) - return md_list diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py deleted file mode 100644 index 9b74450..0000000 --- a/md_discovery/md_discover.py +++ /dev/null @@ -1,27 +0,0 @@ -from ConfigSpace import Configuration - -from md_discovery.discovery_executor import pairs_inference -from md_discovery.discovery_executor_gpu import discover -from settings import * - -# # 若不输出support和confidence,使用以下两块代码 -# # 将列表1写入本地,路径需自己修改 -# md_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_md_list.txt' -# with open(md_path, 'w') as f: -# for _ in mds: -# f.write(str(_) + '\n') -# -# # 将列表2写入本地,路径需自己修改 -# vio_path = '/home/w/A-New Folder/8.14/Paper Dataset/TP_vio_list.txt' -# with open(vio_path, 'w') as f: -# for _ in vio: -# f.write(str(_) + '\n') - - -def md_discover(config: Configuration, source_path, target_path): - mds_list = discover(source_path, target_attr) - with open(target_path, 'w') as f: - for _ in mds_list: - f.write('Target:'+str(target_attr) + '\t') - f.write(str(_)) - f.write('\n') diff --git a/md_discovery/md_mining.py b/md_discovery/md_mining.py index d0f0c72..941e7f6 100644 --- a/md_discovery/md_mining.py +++ b/md_discovery/md_mining.py @@ -1,4 +1,5 @@ import itertools +import pickle import random import operator from operator import itemgetter @@ -16,10 +17,12 @@ from settings import * def mining(train: pd.DataFrame): # data is train set, in which each row represents a tuple pair train = train.astype(str) + # 将label列移到最后 + train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1) # 尝试不将左右表key手动调整相同,而是只看gold属性是否为1 # 故将左右表key直接去除 - data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False) + data = train.drop(columns=['_id', 'ltable_id', 'rtable_id'], inplace=False) # data中现存属性:除key以外左右表属性和gold, 不含_id columns = data.columns.values.tolist() columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')] @@ -62,7 +65,7 @@ def mining(train: pd.DataFrame): # 生成带标签的相似度张量 sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1) # 找到匹配元组对的行索引 - mask = (data['gold'].isin(['1'])) + mask = (data['label'].isin(['1'])) match_pair_indices = data[mask].index.tolist() # 根据索引将匹配的行标签置为1 sim_table_tensor_labeled[match_pair_indices, -1] = 1.00 @@ -209,7 +212,7 @@ def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list): def mds_to_txt(result_list_): - p = md_output_dir + "mds.txt" + p = md_output_dir + r"\mds.txt" with open(p, 'w') as f: for _ in result_list_: f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}') @@ -253,3 +256,10 @@ def merge_mds(md_list_): del same_sc_list[index] # 二级列表转一级列表 return list(itertools.chain.from_iterable(grouped_md_tuples)) + + +if __name__ == '__main__': + _train = pd.read_csv(directory_path + r'\train_whole.csv') + result = 
mining(_train) + with open(md_output_dir + r"\mds.pickle", "wb") as file_: + pickle.dump(result, file_) diff --git a/ml_er/magellan_er.py b/ml_er/magellan_er.py index 9ed7361..4f1148b 100644 --- a/ml_er/magellan_er.py +++ b/ml_er/magellan_er.py @@ -11,89 +11,20 @@ from ConfigSpace import Configuration from ConfigSpace.read_and_write import json as csj import py_entitymatching.catalog.catalog_manager as cm from tqdm import tqdm +from colorama import Fore -from md_discovery.md_mining import mining from settings import * -def blocking_mining(): - start = time.time() - ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') - cm.set_key(ltable, ltable_id) - rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') - cm.set_key(rtable, rtable_id) - mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1') - matching_number = len(mappings) - # if ltable_id == rtable_id: - # tables_id = rtable_id - attributes = ltable.columns.values.tolist() - # lattributes = ['ltable_' + i for i in attributes] - # rattributes = ['rtable_' + i for i in attributes] - cm.set_key(ltable, ltable_id) - cm.set_key(rtable, rtable_id) +def matching(config: Configuration): + print(Fore.BLUE + f'Config: {config}') + with open(md_output_dir + r"\mds.pickle", "rb") as file: + md_list = pickle.load(file) - blocker = em.OverlapBlocker() - candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True, - l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1, - overlap_size=1, show_progress=False) - candidate['gold'] = 0 - candidate = candidate.reset_index(drop=True) - block_time = time.time() - print(f'Block Time: {block_time - start}') - - # 根据mapping表标注数据 - candidate_match_rows = [] - for t in tqdm(mappings.itertuples()): - mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) & - (candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)]))) - matching_indices = candidate[mask].index - candidate_match_rows.extend(matching_indices.tolist()) - match_rows_mask = candidate.index.isin(candidate_match_rows) - candidate.loc[match_rows_mask, 'gold'] = 1 - candidate.fillna(value="", inplace=True) - - # negative样本太多, 采样三倍于positive样本量 - candidate_mismatch = candidate[candidate['gold'] == 0] - candidate_match = candidate[candidate['gold'] == 1] - candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match)) - candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match]) - # 如果拼接后不重设索引可能导致索引重复 - candidate_for_train_test = candidate_for_train_test.reset_index(drop=True) - cm.set_key(candidate_for_train_test, '_id') - cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id) - cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id) - cm.set_ltable(candidate_for_train_test, ltable) - cm.set_rtable(candidate_for_train_test, rtable) - block_recall = len(candidate_match) / matching_number - - # 分为训练测试集 - train_proportion = 0.5 - sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0) - train_set = sets['train'] - test_set = sets['test'] - label_and_split_time = time.time() - print(f'Label and Split Time: {label_and_split_time - block_time}') - - # 挖掘MD并保存本地 - md_list = mining(train_set) - mining_time = time.time() - print(f'Mining Time: {mining_time - label_and_split_time}') - blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall) - # 将blocking结果保存到本地 - with open(er_output_dir + "blocking_result.pickle", "wb") as file_: - pickle.dump(blocking_results, 
file_) - return blocking_results - - -def matching(config: Configuration, blocking_result_): - print(f'\033[33mConfig: {config}\033[0m') - start = time.time() - ltable = blocking_result_[0] - rtable = blocking_result_[1] - train_set = blocking_result_[2] - test_set = blocking_result_[3] - md_list = blocking_result_[4] - block_recall = blocking_result_[5] + train_set = pd.read_csv(directory_path + r'\train_whole.csv', encoding='ISO-8859-1') + test_set = pd.read_csv(directory_path + r'\test_whole.csv', encoding='ISO-8859-1') + ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1') + rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1') ml_matcher = config["ml_matcher"] match ml_matcher: case "dt": @@ -109,22 +40,22 @@ def matching(config: Configuration, blocking_result_): max_features=config['rf_max_features']) cm.set_key(train_set, '_id') - cm.set_fk_ltable(train_set, 'ltable_' + ltable_id) - cm.set_fk_rtable(train_set, 'rtable_' + rtable_id) + cm.set_fk_ltable(train_set, 'ltable_id') + cm.set_fk_rtable(train_set, 'rtable_id') cm.set_ltable(train_set, ltable) cm.set_rtable(train_set, rtable) - cm.set_key(ltable, ltable_id) - cm.set_key(rtable, rtable_id) + cm.set_key(ltable, 'id') + cm.set_key(rtable, 'id') cm.set_key(test_set, '_id') - cm.set_fk_ltable(test_set, 'ltable_' + ltable_id) - cm.set_fk_rtable(test_set, 'rtable_' + rtable_id) + cm.set_fk_ltable(test_set, 'ltable_id') + cm.set_fk_rtable(test_set, 'rtable_id') cm.set_ltable(test_set, ltable) cm.set_rtable(test_set, rtable) feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False) train_feature_vecs = em.extract_feature_vecs(train_set, feature_table=feature_table, - attrs_after=['gold'], + attrs_after=['label'], show_progress=False) train_feature_vecs.fillna(value=0, inplace=True) @@ -132,22 +63,21 @@ def matching(config: Configuration, blocking_result_): for _ in test_feature_after[:]: test_feature_after.append(_.replace('ltable_', 'rtable_')) for _ in test_feature_after: - if _.endswith(ltable_id) or _.endswith(rtable_id): + if _.endswith('id'): test_feature_after.remove(_) - test_feature_after.append('gold') + test_feature_after.append('label') test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, attrs_after=test_feature_after, show_progress=False) test_feature_vecs.fillna(value=0, inplace=True) - fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold'] - matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') - test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id]) + fit_exclude = ['_id', 'ltable_id', 'rtable_id', 'label'] + matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='label') + test_feature_after.extend(['_id', 'ltable_id', 'rtable_id']) predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, append=True, target_attr='predicted', inplace=False) - eval_result = em.eval_matches(predictions, 'gold', 'predicted') + eval_result = em.eval_matches(predictions, 'label', 'predicted') em.print_eval_summary(eval_result) - indicators = evaluate_prediction(predictions, 'gold', 'predicted') - indicators['block_recall'] = block_recall + indicators = evaluate_prediction(predictions, 'label', 'predicted') test_feature_after.remove('_id') test_feature_after.append('predicted') @@ -158,31 +88,26 @@ def matching(config: Configuration, blocking_result_): # 目前predictions包含的属性:左右表全部属性+gold+predicted 
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) predictions['confidence'] = 0 + predictions['md'] = '' epl_match = 0 # 可解释,预测match if len(md_list) > 0: for row in tqdm(predictions.itertuples()): - x = is_explicable(row, md_list, sim_tensor_dict) - if x > 0 and str(getattr(row, 'predicted')) == str(1): - predictions.loc[row[0], 'confidence'] = x - epl_match += 1 + if str(getattr(row, 'predicted')) == str(1): + conf, md_dict = is_explicable(row, md_list, sim_tensor_dict) + if conf > 0: + predictions.loc[row[0], 'confidence'] = conf + predictions.loc[row[0], 'md'] = str(md_dict) + epl_match += 1 df = predictions[predictions['predicted'] == str(1)] interpretability = epl_match / len(df) # 可解释性 indicators['interpretability'] = interpretability - - # note 既然不调block参数, 不妨假设block_recall很高, 不必考虑 - # if indicators["block_recall"] < indicators["recall"]: - # f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / ( - # indicators["precision"] + indicators["block_recall"]) - # else: - # f1 = indicators["F1"] - performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"] indicators['performance'] = performance - print(f'ER Indicators: {indicators}') - predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True) - print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m') + print(Fore.BLUE + f'ER Indicators: {indicators}') + predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True) + print(Fore.CYAN + f'Finish Time: {time.time()}') return indicators @@ -223,7 +148,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame): col_tuple_list.append((left_index, right_index)) length = predictions.shape[0] - width = predictions.shape[1] + # width = predictions.shape[1] predictions = predictions.reset_index(drop=True) sentences = predictions.values.flatten(order='F').tolist() @@ -238,7 +163,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame): rattr_tensor = norm_table_tensor[col_tuple[1]] mul_tensor = lattr_tensor * rattr_tensor sim_tensor = torch.sum(mul_tensor, 1) - sim_tensor = torch.round(sim_tensor, decimals=4) + sim_tensor = torch.round(sim_tensor, decimals=2) sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor return sim_tensor_dict @@ -252,31 +177,28 @@ def is_explicable(row, all_mds: list, st_dict): explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 break # 不再与当前md的其他相似度阈值比较,跳转到下一条md if explicable: - return md_tuple[2] # 任意一条md能解释,直接返回 - return -1.0 # 遍历结束,不能解释 + return md_tuple[2], md_tuple[0] # 任意一条md能解释,直接返回 + return -1.0, {} # 遍历结束,不能解释 -def ml_er(config: Configuration, blocking_result_): - indicators = matching(config, blocking_result_) - output_path = er_output_dir + "eval_result.txt" +def ml_er(config: Configuration): + indicators = matching(config) + output_path = er_output_dir + r"\eval_result.txt" with open(output_path, 'w') as _f: _f.write('Precision:' + str(indicators["precision"]) + '\n') _f.write('Recall:' + str(indicators["recall"]) + '\n') _f.write('F1:' + str(indicators["F1"]) + '\n') - _f.write('block_recall:' + str(indicators["block_recall"]) + '\n') _f.write('interpretability:' + str(indicators['interpretability']) + '\n') _f.write('performance:' + str(indicators['performance']) + '\n') if __name__ == '__main__': - if os.path.isfile(hpo_output_dir + "incumbent.json"): - with open(hpo_output_dir + "configspace.json", 'r') as f: + if os.path.isfile(hpo_output_dir + 
r"\incumbent.json"): + with open(hpo_output_dir + r"\configspace.json", 'r') as f: dict_configspace = json.load(f) str_configspace = json.dumps(dict_configspace) configspace = csj.read(str_configspace) - with open(hpo_output_dir + "incumbent.json", 'r') as f: + with open(hpo_output_dir + r"\incumbent.json", 'r') as f: dic = json.load(f) configuration = ConfigSpace.Configuration(configspace, values=dic) - with open(er_output_dir + "blocking_result.pickle", "rb") as file: - blocking_result = pickle.load(file) - ml_er(configuration, blocking_result) + ml_er(configuration) diff --git a/ml_er/magellan_start.py b/ml_er/magellan_start.py deleted file mode 100644 index 838ce17..0000000 --- a/ml_er/magellan_start.py +++ /dev/null @@ -1,4 +0,0 @@ -from ml_er.magellan_er import blocking_mining - -if __name__ == '__main__': - blocking_mining() diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py deleted file mode 100644 index c65847e..0000000 --- a/ml_er/ml_entity_resolver.py +++ /dev/null @@ -1,363 +0,0 @@ -import json -import os -import sys -import time - -import ConfigSpace -import pandas -import torch -from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric -from ConfigSpace.read_and_write import json as csj -import py_entitymatching as em -import py_entitymatching.catalog.catalog_manager as cm -import pandas as pd -import six -from ConfigSpace import Configuration -from tqdm import tqdm - -from md_discovery.md_discover import md_discover -from settings import * - - -def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"): - df = train[train['gold'] == 1] - # 元组对左右ID调整一致 - for index, row in df.iterrows(): - df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] - - train_columns = train.columns.values.tolist() - l_columns = [] - r_columns = [] - cols = [] - # 左表和右表字段名分别加入两个列表 - for _ in train_columns: - if _.startswith('ltable'): - l_columns.append(_) - elif _.startswith('rtable'): - r_columns.append(_) - # 将左表中字段名去掉前缀,作为统一的字段名列表(前提是两张表内对应字段名调整一致) - for _ in l_columns: - cols.append(_.replace('ltable_', '')) - - ldf = df[l_columns] - rdf = df[r_columns] - ldf.columns = cols - rdf.columns = cols - t_single_tuple = pd.concat([ldf, rdf]) - t_single_tuple = t_single_tuple.reset_index(drop=True) - - t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1) - - -def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, - candidate: pd.DataFrame) -> dict: - new_df = df.reset_index(drop=False, inplace=False) - gold = new_df[labeled_attr] - predicted = new_df[predicted_attr] - gold_negative = gold[gold == 0].index.values - gold_positive = gold[gold == 1].index.values - predicted_negative = predicted[predicted == 0].index.values - predicted_positive = predicted[predicted == 1].index.values - - false_positive_indices = list(set(gold_negative).intersection(predicted_positive)) - true_positive_indices = list(set(gold_positive).intersection(predicted_positive)) - false_negative_indices = list(set(gold_positive).intersection(predicted_negative)) - - num_true_positives = float(len(true_positive_indices)) - num_false_positives = float(len(false_positive_indices)) - num_false_negatives = float(len(false_negative_indices)) - - precision_denominator = num_true_positives + num_false_positives - recall_denominator = num_true_positives + num_false_negatives - - precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator - 
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator - F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall) - block_recall = len(candidate[candidate['gold'] == 1]) / matching_number - - return {"precision": precision, "recall": recall, "F1": F1, "block_recall": block_recall} - - -def load_mds(paths: list) -> list: - if len(paths) == 0: - return [] - all_mds = [] - # 传入md路径列表 - for md_path in paths: - if not os.path.exists(md_path): - continue - mds = [] - # 打开每一个md文件 - with open(md_path, 'r') as f: - # 读取每一行的md,加入该文件的md列表 - for line in f.readlines(): - md_metadata = line.strip().split('\t') - # todo 如果MD文件的形式改了 这里也要改 - md = eval(md_metadata[1]) - mds.append(md) - all_mds.extend(mds) - return all_mds - - -def is_explicable(row, all_mds: list, st_dict): - attrs = all_mds[0][0].keys() # 从第一条md_tuple中的md字典中读取所有字段 - for md_tuple in all_mds: - explicable = True # 假设这条md能解释当前元组 - for a in attrs: - if a != target_attr: - if st_dict[a][row[0]].item() < md_tuple[0][a]: - explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 - break # 不再与当前md的其他相似度阈值比较,跳转到下一条md - if explicable: - return md_tuple[2] # 任意一条md能解释,直接返回 - return -1.0 # 遍历结束,不能解释 - - -# 形成一个字典,key为字段名称,value为一维张量,记录了预测表中这一字段每行的左右属性的相似度 -def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame): - predictions_attrs = predictions.columns.values.tolist() - col_tuple_list = [] - for _ in predictions_attrs: - if _.startswith('ltable'): - left_index = predictions_attrs.index(_) - right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_')) - col_tuple_list.append((left_index, right_index)) - - length = predictions.shape[0] - width = predictions.shape[1] - sentences = [] - for col in range(0, width): - for row in range(0, length): - cell_value = predictions.values[row, col] - sentences.append(cell_value) - embedding = model.encode(sentences, convert_to_tensor=True, device="cuda") - split_embedding = torch.split(embedding, length, dim=0) - table_tensor = torch.stack(split_embedding, dim=0, out=None) - # prediction的归一化嵌入张量 - norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) - sim_tensor_dict = {} - for col_tuple in col_tuple_list: - lattr_tensor = norm_table_tensor[col_tuple[0]] - rattr_tensor = norm_table_tensor[col_tuple[1]] - mul_tensor = lattr_tensor * rattr_tensor - sim_tensor = torch.sum(mul_tensor, 1) - sim_tensor = torch.round(sim_tensor, decimals=4) - sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor - return sim_tensor_dict - - -def er_process(config: Configuration): - start = time.time() - ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') - cm.set_key(ltable, ltable_id) - # ltable.fillna("", inplace=True) - rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') - cm.set_key(rtable, rtable_id) - # rtable.fillna("", inplace=True) - mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1') - - # 仅保留两表中出现在映射表中的行,增大正样本比例 - lid_mapping_list = [] - rid_mapping_list = [] - # 全部转为字符串 - # ltable = ltable.astype(str) - # rtable = rtable.astype(str) - # mappings = mappings.astype(str) - matching_number = len(mappings) # 所有阳性样本数 - - for index, row in mappings.iterrows(): - lid_mapping_list.append(row[mapping_lid]) - rid_mapping_list.append(row[mapping_rid]) - - selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)] - tables_id = rtable_id - selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)] - selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名 
- attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] - attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] - cm.set_key(selected_ltable, tables_id) - cm.set_key(selected_rtable, tables_id) - - blocker = em.OverlapBlocker() - candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], - config["block_attr"], allow_missing=True, - l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, - overlap_size=1, show_progress=False) - - candidate['gold'] = 0 - candidate = candidate.reset_index(drop=True) - candidate_match_rows = [] - for row in candidate.itertuples(): - l_id = getattr(row, 'ltable_' + tables_id) - map_row = mappings[mappings[mapping_lid] == l_id] - - if map_row is not None: - r_id = map_row[mapping_rid] - for value in r_id: - if value == getattr(row, 'rtable_' + tables_id): - candidate_match_rows.append(row[0]) - else: - continue - - for _ in candidate_match_rows: - candidate.loc[_, 'gold'] = 1 - - candidate.fillna(value="", inplace=True) - - # 裁剪负样本,保持正负样本数量一致 - candidate_mismatch = candidate[candidate['gold'] == 0] - candidate_match = candidate[candidate['gold'] == 1] - if len(candidate_mismatch) > len(candidate_match): - candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match)) - # 拼接正负样本 - candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match]) - # if len(candidate_for_train_test) == 0: - # return 0 - # 如果拼接后不重设索引可能导致索引重复 - candidate_for_train_test = candidate_for_train_test.reset_index(drop=True) - cm.set_key(candidate_for_train_test, '_id') - cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id) - cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id) - cm.set_ltable(candidate_for_train_test, selected_ltable) - cm.set_rtable(candidate_for_train_test, selected_rtable) - block_recall = len(candidate_match) / matching_number - - # 分为训练测试集 - train_proportion = 0.7 - sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0) - train_set = sets['train'] - test_set = sets['test'] - - # cm.set_key(train_set, '_id') - # cm.set_fk_ltable(train_set, 'ltable_' + tables_id) - # cm.set_fk_rtable(train_set, 'rtable_' + tables_id) - # cm.set_ltable(train_set, selected_ltable) - # cm.set_rtable(train_set, selected_rtable) - # - # cm.set_key(test_set, '_id') - # cm.set_fk_ltable(test_set, 'ltable_' + tables_id) - # cm.set_fk_rtable(test_set, 'rtable_' + tables_id) - # cm.set_ltable(test_set, selected_ltable) - # cm.set_rtable(test_set, selected_rtable) - - ml_matcher = config["ml_matcher"] - if ml_matcher == "dt": - matcher = em.DTMatcher(name='DecisionTree', random_state=0) - elif ml_matcher == "svm": - matcher = em.SVMMatcher(name='SVM', random_state=0) - elif ml_matcher == "rf": - matcher = em.RFMatcher(name='RF', random_state=0) - elif ml_matcher == "lg": - matcher = em.LogRegMatcher(name='LogReg', random_state=0) - elif ml_matcher == "ln": - matcher = em.LinRegMatcher(name='LinReg') - elif ml_matcher == "nb": - matcher = em.NBMatcher(name='NaiveBayes') - - feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False) - - train_feature_vecs = em.extract_feature_vecs(train_set, - feature_table=feature_table, - attrs_after=['gold'], - show_progress=False) - train_feature_vecs.fillna(value=0, inplace=True) - - test_feature_after = attrs_with_l_prefix[:] - test_feature_after.extend(attrs_with_r_prefix) - for _ in test_feature_after: - if _.endswith(tables_id): - 
test_feature_after.remove(_) - test_feature_after.append('gold') - test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, - attrs_after=test_feature_after, show_progress=False) - test_feature_vecs.fillna(value=0, inplace=True) - - fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold'] - matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') - test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id]) - predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, - append=True, target_attr='predicted', inplace=False) - eval_result = em.eval_matches(predictions, 'gold', 'predicted') - em.print_eval_summary(eval_result) - indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test) - - # 计算可解释性 - ################################################################################################################ - predictions_attrs = [] - predictions_attrs.extend(attrs_with_l_prefix) - predictions_attrs.extend(attrs_with_r_prefix) - predictions_attrs.extend(['gold', 'predicted']) - predictions = predictions[predictions_attrs] - # 必须从训练集内挖MD - train_attrs = predictions_attrs[:] - train_attrs.remove('predicted') - train_set = train_set[train_attrs] - prepare_file_for_md_discovery(train_set) - predictions = predictions.reset_index(drop=True) - predictions = predictions.astype(str) - sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) - predictions['confidence'] = 0 - - md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt") - md_paths = [md_output_dir + 'mds.txt'] - md_list = load_mds(md_paths) # 从全局变量中读取所有的md - epl_match = 0 # 可解释,预测match - - unexplainable = pd.DataFrame() - - if len(md_list) > 0: - for row in tqdm(predictions.itertuples()): - x = is_explicable(row, md_list, sim_tensor_dict) - if x > 0 and str(getattr(row, 'predicted')) == str(1): - predictions.loc[row[0], 'confidence'] = x - epl_match += 1 - # else: - # series = pd.Series(row) - # unexplainable = unexplainable._append(series, ignore_index=True) - # unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True) - # unexplainable.columns = predictions_attrs - # unexplainable = unexplainable[train_attrs] - # if len(unexplainable[unexplainable['gold'] == str(1)]) > 0: - # prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv') - # md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt") - - df = predictions[predictions['predicted'] == str(1)] - interpretability = epl_match / len(df) # 可解释性 - indicators['interpretability'] = interpretability - if indicators["block_recall"] < indicators["recall"]: - f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / ( - indicators["precision"] + indicators["block_recall"]) - else: - f1 = indicators["F1"] - performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 - indicators['performance'] = performance - indicators['eval_result'] = eval_result - print(indicators) - predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True) - ################################################################################################################ - print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m') - return indicators - - -def ml_er(config: Configuration = None): - indicators = er_process(config) - 
output_path = er_output_dir + "eval_result.txt" - with open(output_path, 'w') as f: - for key, value in six.iteritems(_get_metric(indicators['eval_result'])): - f.write(key + " : " + value) - f.write('\n') - f.write('block_recall:' + str(indicators["block_recall"]) + '\n') - f.write('interpretability:' + str(indicators['interpretability']) + '\n') - f.write('performance:' + str(indicators['performance']) + '\n') - - -if __name__ == '__main__': - if os.path.isfile(hpo_output_dir + "incumbent.json"): - with open(hpo_output_dir + "configspace.json", 'r') as f: - dict_configspace = json.load(f) - str_configspace = json.dumps(dict_configspace) - configspace = csj.read(str_configspace) - with open(hpo_output_dir + "incumbent.json", 'r') as f: - dic = json.load(f) - configuration = ConfigSpace.Configuration(configspace, values=dic) - ml_er(configuration) diff --git a/settings.py b/settings.py index 00eb5bc..c657684 100644 --- a/settings.py +++ b/settings.py @@ -1,24 +1,12 @@ from sentence_transformers import SentenceTransformer -ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv' -rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv' -mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv' -mapping_lid = 'idAbt' # mapping表中左表id名 -mapping_rid = 'idBuy' # mapping表中右表id名 -ltable_block_attr = 'name' -rtable_block_attr = 'name' -ltable_id = 'id' # 左表id字段名称 -rtable_id = 'id' # 右表id字段名称 -target_attr = 'id' # 进行md挖掘时的目标字段 -# lr_attrs_map = {} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 +directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon' -model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2') +er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\ml_er\output' +md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\md_discovery\output' +hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_magellan\hpo\output' + +model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2') interpre_weight = 0 # 可解释性权重 -similarity_threshold = 0.1 support_threshold = 1 confidence_threshold = 0.75 - -er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\' -md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\' -hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\' -
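
After this patch, the repository is driven through three `__main__` entry points shown above: `md_discovery/md_mining.py` mines MDs from `train_whole.csv` and pickles them to `mds.pickle`, `hpo/magellan_hpo.py` runs the SMAC `BlackBoxFacade` search (16 trials) over the matcher configuration, and `ml_er/magellan_er.py` evaluates a configuration and writes `eval_result.txt`. The following is a minimal driver sketch, not part of the patch itself: it simply chains those entry points in the order the new code expects (MDs must exist before `matching` loads `mds.pickle`). Module and setting names are taken from the diff; the script, and running everything from the repository root, are assumptions, and the data paths in `settings.py` are machine-specific.

    # driver_sketch.py -- hypothetical end-to-end run of the pipeline introduced by this patch
    import pickle

    import pandas as pd

    from md_discovery.md_mining import mining
    from hpo.magellan_hpo import ml_er_hpo
    from ml_er.magellan_er import ml_er
    from settings import directory_path, md_output_dir

    # 1. Mine matching dependencies from the full training split and cache them,
    #    mirroring the __main__ block of md_discovery/md_mining.py.
    train = pd.read_csv(directory_path + r'\train_whole.csv')
    md_list = mining(train)
    with open(md_output_dir + r"\mds.pickle", "wb") as f:
        pickle.dump(md_list, f)

    # 2. Hyperparameter search: each trial calls matching(config), which reloads
    #    mds.pickle, so step 1 must have completed first.
    incumbent = ml_er_hpo()

    # 3. Final matching run with the incumbent configuration; writes
    #    predictions.csv and eval_result.txt to er_output_dir.
    ml_er(incumbent)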