diff --git a/hpo/ditto_hpo.py b/hpo/ditto_hpo.py
index 5f9f150..ee2876e 100644
--- a/hpo/ditto_hpo.py
+++ b/hpo/ditto_hpo.py
@@ -1,6 +1,7 @@
 import sys
 sys.path.append('/root/hjt/md_bayesian_er_ditto/')
-
+import os
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 import json
 import time
 from colorama import init, Fore
diff --git a/md_discovery/md_mining.py b/md_discovery/md_mining.py
index 3f5df74..5c50efe 100644
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@@ -1,3 +1,6 @@
+import sys
+sys.path.append('/root/hjt/md_bayesian_er_ditto/')
+
 import itertools
 import pickle
 import random
@@ -12,8 +15,7 @@ import torch.nn.functional
 from tqdm import tqdm
 from setting import *
-import sys
-sys.path.append('/root/hjt/md_bayesian_er_ditto/')
+

 # note: when embedding the tables, cells with null values are located; during similarity computation, any pair involving a null cell is assigned a similarity of -1.0000
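The note above documents md_mining.py's null-handling convention: null cells are located at embedding time, and any cell pair involving a null is forced to similarity -1.0000. A minimal sketch of that convention, assuming a precomputed boolean null_mask and a hypothetical helper name (not the repository's actual implementation):

import torch

def masked_similarity(left_emb: torch.Tensor,
                      right_emb: torch.Tensor,
                      null_mask: torch.Tensor) -> torch.Tensor:
    # Row-wise cosine similarity for two (n, d) embedding matrices; rows where
    # either cell was null (null_mask == True) are forced to -1.0, as in the note.
    left = torch.nn.functional.normalize(left_emb, dim=1)
    right = torch.nn.functional.normalize(right_emb, dim=1)
    sim = torch.sum(left * right, dim=1)  # dot product of unit vectors = cosine
    sim = torch.round(sim * 100) / 100    # same two-decimal rounding as build_col_pairs_sim_tensor_dict below
    return torch.where(null_mask, torch.full_like(sim, -1.0), sim)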
diff --git a/ml_er/ditto_er.py b/ml_er/ditto_er.py
index c8755a9..a8619f2 100644
--- a/ml_er/ditto_er.py
+++ b/ml_er/ditto_er.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import time
 os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 sys.path.append('/root/hjt/md_bayesian_er_ditto/')
@@ -8,7 +9,9 @@ import pickle
 import torch
 import json
 import numpy as np
+import pandas as pd
 import random
+from tqdm import tqdm
 from setting import *
 from colorama import Fore
 from argparse import Namespace
@@ -50,11 +53,12 @@ def matching(hpo_config):
     hp.batch_size = hpo_config['batch_size']
     hp.max_len = hpo_config['max_len']
     hp.lr = 3e-5
-    hp.n_epochs = 20
+    # hp.n_epochs = 20
+    hp.n_epochs = 2
     # hp.finetuning
     hp.save_model = True
     hp.input_path = config['testset']
-    hp.output_path = '/root/autodl-tmp/output/matched_small.jsonl'
+    hp.output_path = '/root/autodl-tmp/output/predictions.jsonl'
     hp.logdir = '/root/autodl-tmp/checkpoints/'
     hp.checkpoint_path = '/root/autodl-tmp/checkpoints/'
@@ -106,7 +110,7 @@ def matching(hpo_config):
                        hp.lm, hp.use_gpu, hp.fp16)
     summarizer = dk_injector = None
-    pdb.set_trace()
+
     if hp.summarize:
         summarizer = Summarizer(config, hp.lm)
@@ -120,18 +124,116 @@ def matching(hpo_config):
               lm=hp.lm,
               dk_injector=dk_injector,
               threshold=threshold)
-    # todo indicators
-    # write results
-    # interpretability
-    indicators = {}
+
+    predictions_raw = pd.read_json(hp.output_path, encoding='ISO-8859-1', lines=True)
+    predictions = pd.read_csv(directory_path + '/test_whole.csv', encoding='ISO-8859-1')
+    predictions['predicted'] = predictions_raw['match']
+    indicators = evaluate_prediction(predictions, 'label', 'predicted')
+    predictions.drop(columns='_id', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+    predictions = predictions.astype(str)
+
+    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
+    predictions['confidence'] = 0
+    predictions['md'] = ''
+
+    epl_match = 0  # explicable pairs that were predicted as matches
+    if len(md_list) > 0:
+        for row in tqdm(predictions.itertuples()):
+            if str(getattr(row, 'predicted')) == str(1):
+                conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
+                if conf > 0:
+                    predictions.loc[row[0], 'confidence'] = conf
+                    predictions.loc[row[0], 'md'] = str(md_dict)
+                    epl_match += 1
+
+    df = predictions[predictions['predicted'] == str(1)]
+    interpretability = epl_match / len(df)  # interpretability: share of predicted matches that some MD explains
+    indicators['interpretability'] = interpretability
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
+    indicators['performance'] = performance
+    print(Fore.BLUE + f'ER Indicators: {indicators}')
+    predictions.to_csv(er_output_dir + '/predictions.csv', sep=',', index=False, header=True)
+    print(Fore.CYAN + f'Finish Time: {time.time()}')
     return indicators


-# todo ml_er function
+def evaluate_prediction(prediction_: pd.DataFrame, labeled_attr: str, predicted_attr: str) -> dict:
+    new_df = prediction_.reset_index(drop=False, inplace=False)
+    gold = new_df[labeled_attr]
+    predicted = new_df[predicted_attr]
+    gold_negative = gold[gold == 0].index.values
+    gold_positive = gold[gold == 1].index.values
+    predicted_negative = predicted[predicted == 0].index.values
+    predicted_positive = predicted[predicted == 1].index.values
+
+    false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
+    true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
+    false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
+
+    num_true_positives = float(len(true_positive_indices))
+    num_false_positives = float(len(false_positive_indices))
+    num_false_negatives = float(len(false_negative_indices))
+
+    precision_denominator = num_true_positives + num_false_positives
+    recall_denominator = num_true_positives + num_false_negatives
+
+    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
+    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
+    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
+
+    return {"precision": precision, "recall": recall, "F1": F1}
+
+
+def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
+    predictions_attrs = predictions.columns.values.tolist()
+    col_tuple_list = []
+    for _ in predictions_attrs:
+        if _.startswith('ltable'):
+            left_index = predictions_attrs.index(_)
+            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
+            col_tuple_list.append((left_index, right_index))
+
+    length = predictions.shape[0]
+    # width = predictions.shape[1]
+    predictions = predictions.reset_index(drop=True)
+    sentences = predictions.values.flatten(order='F').tolist()
+
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda", batch_size=256, show_progress_bar=True)
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    # normalized embedding tensor of the predictions table
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
+    sim_tensor_dict = {}
+    for col_tuple in col_tuple_list:
+        lattr_tensor = norm_table_tensor[col_tuple[0]]
+        rattr_tensor = norm_table_tensor[col_tuple[1]]
+        mul_tensor = lattr_tensor * rattr_tensor
+        sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = torch.round(sim_tensor * 100) / 100
+        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
+    return sim_tensor_dict
+
+
+def is_explicable(row, all_mds: list, st_dict):
+    attrs = all_mds[0][0].keys()  # read all attributes from the md dict of the first md_tuple
+    for md_tuple in all_mds:
+        explicable = True  # assume this md can explain the current tuple pair
+        for a in attrs:
+            if st_dict[a][row[0]].item() < md_tuple[0][a]:
+                explicable = False  # if any attribute's similarity misses the threshold, this md cannot explain the pair
+                break  # skip this md's remaining thresholds and move on to the next md
+        if explicable:
+            return md_tuple[2], md_tuple[0]  # return as soon as any md explains the pair
+    return -1.0, {}  # no md explains the pair
+
+
 def ml_er(config: Configuration):
     indicators = matching(config)
     output_path = er_output_dir + "/eval_result.txt"
     with open(output_path, 'w') as _f:
+        _f.write('precision:' + str(indicators['precision']) + '\n')
+        _f.write('recall:' + str(indicators['recall']) + '\n')
         _f.write('F1:' + str(indicators["F1"]) + '\n')
         _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
         _f.write('performance:' + str(indicators['performance']) + '\n')
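A quick sanity check of the evaluate_prediction function added above (toy data, not from the repository): its set-intersection counting reduces to ordinary confusion-matrix arithmetic.

import pandas as pd

toy = pd.DataFrame({'label':     [1, 1, 0, 0, 1],
                    'predicted': [1, 0, 1, 0, 1]})
# TP = 2 (rows 0 and 4), FP = 1 (row 2), FN = 1 (row 1),
# so precision = recall = F1 = 2/3.
print(evaluate_prediction(toy, 'label', 'predicted'))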
diff --git a/setting.py b/setting.py
index 797df41..b0b3be4 100644
--- a/setting.py
+++ b/setting.py
@@ -6,7 +6,7 @@
 er_output_dir = '/root/hjt/md_bayesian_er_ditto/ml_er/output'
 md_output_dir = '/root/hjt/md_bayesian_er_ditto/md_discovery/output'
 hpo_output_dir = '/root/hjt/md_bayesian_er_ditto/hpo/output'
-# model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
+model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
 interpre_weight = 0  # weight of the interpretability term
 support_threshold = 1
 confidence_threshold = 0.75
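For reference, the performance objective computed in matching() combines the two indicators linearly; with interpre_weight = 0 as configured above, the HPO objective reduces to F1 alone. Illustrative values only:

interpre_weight = 0      # value from setting.py above
interpretability = 0.85  # illustrative
F1 = 0.92                # illustrative
performance = interpre_weight * interpretability + (1 - interpre_weight) * F1
print(performance)       # 0.92: with a zero weight the objective is pure F1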