You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
matching_dependency/ml_er/magellan_new.py

282 lines
13 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import os
import pickle
import time
import ConfigSpace
import pandas as pd
import py_entitymatching as em
import torch
from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
from tqdm import tqdm
from md_discovery.md_mining import mining
from settings import *
def blocking_mining():
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
matching_number = len(mappings)
# if ltable_id == rtable_id:
# tables_id = rtable_id
attributes = ltable.columns.values.tolist()
# lattributes = ['ltable_' + i for i in attributes]
# rattributes = ['rtable_' + i for i in attributes]
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
blocker = em.OverlapBlocker()
candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
overlap_size=1, show_progress=False)
candidate['gold'] = 0
candidate = candidate.reset_index(drop=True)
block_time = time.time()
print(f'Block Time: {block_time - start}')
# 根据mapping表标注数据
candidate_match_rows = []
for t in tqdm(mappings.itertuples()):
mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
(candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
matching_indices = candidate[mask].index
candidate_match_rows.extend(matching_indices.tolist())
match_rows_mask = candidate.index.isin(candidate_match_rows)
candidate.loc[match_rows_mask, 'gold'] = 1
candidate.fillna(value="", inplace=True)
# negative样本太多, 采样三倍于positive样本量
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
# 如果拼接后不重设索引可能导致索引重复
candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
cm.set_key(candidate_for_train_test, '_id')
cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
cm.set_ltable(candidate_for_train_test, ltable)
cm.set_rtable(candidate_for_train_test, rtable)
block_recall = len(candidate_match) / matching_number
# 分为训练测试集
train_proportion = 0.5
sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
train_set = sets['train']
test_set = sets['test']
label_and_split_time = time.time()
print(f'Label and Split Time: {label_and_split_time - block_time}')
# 挖掘MD并保存本地
md_list = mining(train_set)
mining_time = time.time()
print(f'Mining Time: {mining_time - label_and_split_time}')
blocking_results = (ltable, rtable, train_set, test_set, md_list, block_recall)
# 将blocking结果保存到本地
with open(er_output_dir + "blocking_result.pickle", "wb") as file_:
pickle.dump(blocking_results, file_)
return blocking_results
def matching(config: Configuration, blocking_result_):
print(f'\033[33mConfig: {config}\033[0m')
start = time.time()
ltable = blocking_result_[0]
rtable = blocking_result_[1]
train_set = blocking_result_[2]
test_set = blocking_result_[3]
md_list = blocking_result_[4]
block_recall = blocking_result_[5]
ml_matcher = config["ml_matcher"]
match ml_matcher:
case "dt":
matcher = em.DTMatcher(name='DecisionTree', random_state=0, criterion=config['tree_criterion'],
max_depth=config['tree_max_depth'], splitter=config['dt_splitter'],
max_features=config['dt_max_features'])
case "svm":
matcher = em.SVMMatcher(name='SVM', random_state=0, kernel=config['svm_kernel'], degree=config['svm_degree'],
gamma=config['svm_gamma'], C=config['svm_C'], coef0=config['svm_constant'])
case "rf":
matcher = em.RFMatcher(name='RandomForest', random_state=0, criterion=config['tree_criterion'],
max_depth=config['tree_max_depth'], n_estimators=config['number_of_tree'],
max_features=config['rf_max_features'])
cm.set_key(train_set, '_id')
cm.set_fk_ltable(train_set, 'ltable_' + ltable_id)
cm.set_fk_rtable(train_set, 'rtable_' + rtable_id)
cm.set_ltable(train_set, ltable)
cm.set_rtable(train_set, rtable)
cm.set_key(ltable, ltable_id)
cm.set_key(rtable, rtable_id)
cm.set_key(test_set, '_id')
cm.set_fk_ltable(test_set, 'ltable_' + ltable_id)
cm.set_fk_rtable(test_set, 'rtable_' + rtable_id)
cm.set_ltable(test_set, ltable)
cm.set_rtable(test_set, rtable)
feature_table = em.get_features_for_matching(ltable, rtable, validate_inferred_attr_types=False)
train_feature_vecs = em.extract_feature_vecs(train_set,
feature_table=feature_table,
attrs_after=['gold'],
show_progress=False)
train_feature_vecs.fillna(value=0, inplace=True)
test_feature_after = ['ltable_' + i for i in ltable.columns.values.tolist()]
for _ in test_feature_after[:]:
test_feature_after.append(_.replace('ltable_', 'rtable_'))
for _ in test_feature_after:
if _.endswith(ltable_id) or _.endswith(rtable_id):
test_feature_after.remove(_)
test_feature_after.append('gold')
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
attrs_after=test_feature_after, show_progress=False)
test_feature_vecs.fillna(value=0, inplace=True)
fit_exclude = ['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id, 'gold']
matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
test_feature_after.extend(['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id])
predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
append=True, target_attr='predicted', inplace=False)
eval_result = em.eval_matches(predictions, 'gold', 'predicted')
em.print_eval_summary(eval_result)
indicators = evaluate_prediction(predictions, 'gold', 'predicted')
indicators['block_recall'] = block_recall
test_feature_after.remove('_id')
test_feature_after.append('predicted')
predictions = predictions[test_feature_after]
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
epl_match = 0 # 可解释预测match
if len(md_list) > 0:
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
epl_match += 1
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # 可解释性
indicators['interpretability'] = interpretability
# note 既然不调block参数, 不妨假设block_recall很高, 不必考虑
# if indicators["block_recall"] < indicators["recall"]:
# f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
# indicators["precision"] + indicators["block_recall"])
# else:
# f1 = indicators["F1"]
performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
indicators['performance'] = performance
print(f'ER Indicators: {indicators}')
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m')
return indicators
def evaluate_prediction(prediction_: pd.DataFrame, labeled_attr: str, predicted_attr: str) -> dict:
new_df = prediction_.reset_index(drop=False, inplace=False)
gold = new_df[labeled_attr]
predicted = new_df[predicted_attr]
gold_negative = gold[gold == 0].index.values
gold_positive = gold[gold == 1].index.values
predicted_negative = predicted[predicted == 0].index.values
predicted_positive = predicted[predicted == 1].index.values
false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
num_true_positives = float(len(true_positive_indices))
num_false_positives = float(len(false_positive_indices))
num_false_negatives = float(len(false_negative_indices))
precision_denominator = num_true_positives + num_false_positives
recall_denominator = num_true_positives + num_false_negatives
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
return {"precision": precision, "recall": recall, "F1": F1}
def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
predictions_attrs = predictions.columns.values.tolist()
col_tuple_list = []
for _ in predictions_attrs:
if _.startswith('ltable'):
left_index = predictions_attrs.index(_)
right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
col_tuple_list.append((left_index, right_index))
length = predictions.shape[0]
width = predictions.shape[1]
predictions = predictions.reset_index(drop=True)
sentences = predictions.values.flatten(order='F').tolist()
embedding = model.encode(sentences, convert_to_tensor=True, device="cuda", batch_size=256, show_progress_bar=True)
split_embedding = torch.split(embedding, length, dim=0)
table_tensor = torch.stack(split_embedding, dim=0, out=None)
# prediction的归一化嵌入张量
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
sim_tensor_dict = {}
for col_tuple in col_tuple_list:
lattr_tensor = norm_table_tensor[col_tuple[0]]
rattr_tensor = norm_table_tensor[col_tuple[1]]
mul_tensor = lattr_tensor * rattr_tensor
sim_tensor = torch.sum(mul_tensor, 1)
sim_tensor = torch.round(sim_tensor, decimals=4)
sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
return sim_tensor_dict
def is_explicable(row, all_mds: list, st_dict):
attrs = all_mds[0][0].keys() # 从第一条md_tuple中的md字典中读取所有字段
for md_tuple in all_mds:
explicable = True # 假设这条md能解释当前元组
for a in attrs:
if st_dict[a][row[0]].item() < md_tuple[0][a]:
explicable = False # 任意一个字段的相似度达不到阈值这条md就不能解释当前元组
break # 不再与当前md的其他相似度阈值比较跳转到下一条md
if explicable:
return md_tuple[2] # 任意一条md能解释直接返回
return -1.0 # 遍历结束,不能解释
def ml_er(config: Configuration, blocking_result_):
indicators = matching(config, blocking_result_)
output_path = er_output_dir + "eval_result.txt"
with open(output_path, 'w') as _f:
_f.write('Precision:' + str(indicators["precision"]) + '\n')
_f.write('Recall:' + str(indicators["recall"]) + '\n')
_f.write('F1:' + str(indicators["F1"]) + '\n')
_f.write('block_recall:' + str(indicators["block_recall"]) + '\n')
_f.write('interpretability:' + str(indicators['interpretability']) + '\n')
_f.write('performance:' + str(indicators['performance']) + '\n')
if __name__ == '__main__':
if os.path.isfile(hpo_output_dir + "incumbent.json"):
with open(hpo_output_dir + "configspace.json", 'r') as f:
dict_configspace = json.load(f)
str_configspace = json.dumps(dict_configspace)
configspace = csj.read(str_configspace)
with open(hpo_output_dir + "incumbent.json", 'r') as f:
dic = json.load(f)
configuration = ConfigSpace.Configuration(configspace, values=dic)
with open(er_output_dir + "blocking_result.pickle", "rb") as file:
blocking_result = pickle.load(file)
ml_er(configuration, blocking_result)