Complete the coding on the ditto platform

master
HuangJintao 5 months ago
parent a317d886f8
commit d46c9d4dbe

@@ -1,6 +1,7 @@
 import sys
 sys.path.append('/root/hjt/md_bayesian_er_ditto/')
+import os
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 import json
 import time
 from colorama import init, Fore

@@ -1,3 +1,6 @@
+import sys
+sys.path.append('/root/hjt/md_bayesian_er_ditto/')
 import itertools
 import pickle
 import random
@@ -12,8 +15,7 @@ import torch.nn.functional
 from tqdm import tqdm
 from setting import *
-import sys
-sys.path.append('/root/hjt/md_bayesian_er_ditto/')
 # note: when embedding the tables, cells with null values are located; during similarity computation, a pair with a null value is set to -1.0000

@ -1,5 +1,6 @@
import os import os
import sys import sys
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
sys.path.append('/root/hjt/md_bayesian_er_ditto/') sys.path.append('/root/hjt/md_bayesian_er_ditto/')
@ -8,7 +9,9 @@ import pickle
import torch import torch
import json import json
import numpy as np import numpy as np
import pandas as pd
import random import random
from tqdm import tqdm
from setting import * from setting import *
from colorama import Fore from colorama import Fore
from argparse import Namespace from argparse import Namespace
@@ -50,11 +53,12 @@ def matching(hpo_config):
     hp.batch_size = hpo_config['batch_size']
     hp.max_len = hpo_config['max_len']
     hp.lr = 3e-5
-    hp.n_epochs = 20
+    # hp.n_epochs = 20
+    hp.n_epochs = 2
     # hp.finetuning
     hp.save_model = True
     hp.input_path = config['testset']
-    hp.output_path = '/root/autodl-tmp/output/matched_small.jsonl'
+    hp.output_path = '/root/autodl-tmp/output/predictions.jsonl'
     hp.logdir = '/root/autodl-tmp/checkpoints/'
     hp.checkpoint_path = '/root/autodl-tmp/checkpoints/'
@@ -106,7 +110,7 @@ def matching(hpo_config):
                        hp.lm, hp.use_gpu, hp.fp16)
     summarizer = dk_injector = None
-    pdb.set_trace()
     if hp.summarize:
         summarizer = Summarizer(config, hp.lm)
@ -120,18 +124,116 @@ def matching(hpo_config):
lm=hp.lm, lm=hp.lm,
dk_injector=dk_injector, dk_injector=dk_injector,
threshold=threshold) threshold=threshold)
# todo indicators
# write results predictions_raw = pd.read_json(hp.output_path, encoding='ISO-8859-1', lines=True)
# interpretability predictions = pd.read_csv(directory_path + '/test_whole.csv', encoding='ISO-8859-1')
indicators = {} predictions['predicted'] = predictions_raw['match']
indicators = evaluate_prediction(predictions, 'label', 'predicted')
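+    # NOTE: this assumes Ditto's jsonl output is row-aligned with test_whole.csv, one prediction per pair.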
+    predictions.drop(columns='_id', inplace=True)
+    predictions = predictions.reset_index(drop=True)
+    predictions = predictions.astype(str)
+    sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
+    predictions['confidence'] = 0
+    predictions['md'] = ''
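+    # Check each predicted match against the discovered MDs (matching dependencies):
+    # a match is explicable if all per-attribute similarities reach some MD's thresholds.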
+    epl_match = 0  # number of explicable predicted matches
+    if len(md_list) > 0:
+        for row in tqdm(predictions.itertuples()):
+            if str(getattr(row, 'predicted')) == str(1):
+                conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
+                if conf > 0:
+                    predictions.loc[row[0], 'confidence'] = conf
+                    predictions.loc[row[0], 'md'] = str(md_dict)
+                    epl_match += 1
+    df = predictions[predictions['predicted'] == str(1)]
+    interpretability = epl_match / len(df)  # share of predicted matches that the MDs can explain
+    indicators['interpretability'] = interpretability
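+    # Final objective blends interpretability with F1 via interpre_weight (set in setting.py).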
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
+    indicators['performance'] = performance
+    print(Fore.BLUE + f'ER Indicators: {indicators}')
+    predictions.to_csv(er_output_dir + '/predictions.csv', sep=',', index=False, header=True)
+    print(Fore.CYAN + f'Finish Time: {time.time()}')
     return indicators
-# todo ml_er function
+
+
+def evaluate_prediction(prediction_: pd.DataFrame, labeled_attr: str, predicted_attr: str) -> dict:
+    new_df = prediction_.reset_index(drop=False, inplace=False)
+    gold = new_df[labeled_attr]
+    predicted = new_df[predicted_attr]
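+    # Partition row indices by gold and predicted labels, then intersect the sets to count TP/FP/FN.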
+    gold_negative = gold[gold == 0].index.values
+    gold_positive = gold[gold == 1].index.values
+    predicted_negative = predicted[predicted == 0].index.values
+    predicted_positive = predicted[predicted == 1].index.values
+    false_positive_indices = list(set(gold_negative).intersection(predicted_positive))
+    true_positive_indices = list(set(gold_positive).intersection(predicted_positive))
+    false_negative_indices = list(set(gold_positive).intersection(predicted_negative))
+    num_true_positives = float(len(true_positive_indices))
+    num_false_positives = float(len(false_positive_indices))
+    num_false_negatives = float(len(false_negative_indices))
+    precision_denominator = num_true_positives + num_false_positives
+    recall_denominator = num_true_positives + num_false_negatives
+    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
+    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
+    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
+    return {"precision": precision, "recall": recall, "F1": F1}
+
+
+def build_col_pairs_sim_tensor_dict(predictions: pd.DataFrame):
+    predictions_attrs = predictions.columns.values.tolist()
+    col_tuple_list = []
+    for _ in predictions_attrs:
+        if _.startswith('ltable'):
+            left_index = predictions_attrs.index(_)
+            right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_'))
+            col_tuple_list.append((left_index, right_index))
+    length = predictions.shape[0]
+    # width = predictions.shape[1]
+    predictions = predictions.reset_index(drop=True)
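+    # Flatten column-major (order='F') so all cells of one column are contiguous in the sentence list.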
+    sentences = predictions.values.flatten(order='F').tolist()
+    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda", batch_size=256, show_progress_bar=True)
+    split_embedding = torch.split(embedding, length, dim=0)
+    table_tensor = torch.stack(split_embedding, dim=0, out=None)
+    # normalized embedding tensor of the predictions table
+    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
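+    # Row-wise dot products of the normalized embeddings give per-pair cosine similarities per attribute.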
+    sim_tensor_dict = {}
+    for col_tuple in col_tuple_list:
+        lattr_tensor = norm_table_tensor[col_tuple[0]]
+        rattr_tensor = norm_table_tensor[col_tuple[1]]
+        mul_tensor = lattr_tensor * rattr_tensor
+        sim_tensor = torch.sum(mul_tensor, 1)
+        sim_tensor = torch.round(sim_tensor * 100) / 100
+        sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor
+    return sim_tensor_dict
+
+
+def is_explicable(row, all_mds: list, st_dict):
+    attrs = all_mds[0][0].keys()  # read all attributes from the md dict of the first md_tuple
+    for md_tuple in all_mds:
+        explicable = True  # assume this md can explain the current tuple
+        for a in attrs:
+            if st_dict[a][row[0]].item() < md_tuple[0][a]:
+                explicable = False  # if any attribute's similarity misses the threshold, this md cannot explain the tuple
+                break  # skip this md's remaining thresholds and move on to the next md
+        if explicable:
+            return md_tuple[2], md_tuple[0]  # return as soon as any md explains the tuple
+    return -1.0, {}  # no md can explain the tuple
+
+
 def ml_er(config: Configuration):
     indicators = matching(config)
     output_path = er_output_dir + "/eval_result.txt"
     with open(output_path, 'w') as _f:
+        _f.write('precision:' + str(indicators['precision']) + '\n')
+        _f.write('recall:' + str(indicators['recall']) + '\n')
         _f.write('F1:' + str(indicators["F1"]) + '\n')
         _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
         _f.write('performance:' + str(indicators['performance']) + '\n')

@@ -6,7 +6,7 @@ er_output_dir = '/root/hjt/md_bayesian_er_ditto/ml_er/output'
 md_output_dir = '/root/hjt/md_bayesian_er_ditto/md_discovery/output'
 hpo_output_dir = '/root/hjt/md_bayesian_er_ditto/hpo/output'
-# model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
+model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
 interpre_weight = 0  # interpretability weight
 support_threshold = 1
 confidence_threshold = 0.75
