From 882c25d20fdd7b7fd463ed79f376e3e46c1a90fa Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Sat, 28 Oct 2023 18:09:59 +0800 Subject: [PATCH] =?UTF-8?q?1.=E6=89=80=E6=9C=89=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E4=BE=9D=E9=9D=A0GPU=202.=E8=BF=AD=E4=BB=A3?= =?UTF-8?q?=E8=BD=AE=E6=95=B0=E8=87=AA=E5=8A=A8=E8=AF=86=E5=88=AB=203.?= =?UTF-8?q?=E8=B6=85=E5=8F=82=E6=95=B0=E4=BC=98=E5=8C=96=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E8=90=BD=E7=9B=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + hpo/er_model_hpo.py | 43 ++++-- md_discovery/md_discover.py | 55 ++----- md_discovery/multi_process_infer_by_pairs.py | 18 +-- md_discovery/tmp_discover.py | 106 ++----------- ml_er/ml_entity_resolver.py | 149 +++++++++++-------- settings.py | 3 +- tfile.py | 64 +++++++- 8 files changed, 212 insertions(+), 229 deletions(-) diff --git a/.gitignore b/.gitignore index 6c7c123..a9db128 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /deprecated/ /datasets/ +/ml_er/output/* +/md_discovery/output/* +/hpo/output/* diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py index e44b9d8..e3937cd 100644 --- a/hpo/er_model_hpo.py +++ b/hpo/er_model_hpo.py @@ -1,14 +1,18 @@ import os +import numpy as np +import torch +import json from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer from ConfigSpace.conditions import InCondition +from ConfigSpace.read_and_write import json as csj import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm import pandas as pd from smac import HyperparameterOptimizationFacade, Scenario from settings import * -from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable +from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict # 数据在外部加载 ######################################################################################################################## @@ -171,23 +175,23 @@ class Classifier: predictions_attrs.extend(attrs_with_r_prefix) predictions_attrs.extend(['gold', 'predicted']) predictions = predictions[predictions_attrs] + predictions = predictions.reset_index(drop=True) + predictions = predictions.astype(str) + sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) # 默认路径为 "../md_discovery/output/xxx.txt" - # 真阳/假阴 mds/vio 共4个md文件 - md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt', - md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt'] - epl_match = 0 # 可解释,预测match - nepl_mismatch = 0 # 不可解释,预测mismatch + # mds/vio 共2个md文件 + md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt'] md_list = load_mds(md_paths) # 从全局变量中读取所有的md + epl_match = 0 # 可解释,预测match if len(md_list) > 0: for line in predictions.itertuples(): - if is_explicable(line, md_list): - if getattr(line, 'predicted') == 1: - epl_match += 1 - else: - if getattr(line, 'predicted') == 0: - nepl_mismatch += 1 - interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1): + epl_match += 1 + + ppre = predictions[predictions['predicted'] == str(1)] + interpretability = epl_match / len(ppre) # 可解释性 + # todo block_recall可以考虑以下注释 # if indicators["block_recall"] >= 0.8: # f1 = indicators["F1"] # else: @@ -201,10 +205,15 @@ class Classifier: def ml_er_hpo(): classifier = Classifier() + cs = classifier.configspace + str_configspace = csj.write(cs) 
+ dict_configspace = json.loads(str_configspace) + with open(hpo_output_dir + "configspace.json", "w") as f: + json.dump(dict_configspace, f) # Next, we create an object, holding general information about the run scenario = Scenario( - classifier.configspace, + cs, deterministic=True, n_trials=10, # We want to run max 50 trials (combination of config and seed) n_workers=1 @@ -221,9 +230,11 @@ def ml_er_hpo(): ) incumbent = smac.optimize() + incumbent_ndarray = incumbent.get_array() + np.save(hpo_output_dir + 'incumbent.npy', incumbent_ndarray) # Get cost of default configuration - default_cost = smac.validate(classifier.configspace.get_default_configuration()) + default_cost = smac.validate(cs.get_default_configuration()) print(f"Default cost: {default_cost}") # Let's calculate the cost of the incumbent @@ -235,4 +246,4 @@ def ml_er_hpo(): if __name__ == '__main__': - print(1) + ml_er_hpo() diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py index 043bcdd..876513a 100644 --- a/md_discovery/md_discover.py +++ b/md_discovery/md_discover.py @@ -1,5 +1,4 @@ -from md_discovery.multi_process_infer_by_pairs import inference_from_record_pairs -from md_discovery.multi_process_infer_by_pairs import get_mds_metadata +from md_discovery import tmp_discover from settings import * # # 若不输出support和confidence,使用以下两块代码 @@ -18,51 +17,25 @@ from settings import * def md_discover(): # 目前可以仿照这个main函数写 - tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv" - fn_single_tuple_path = er_output_dir + "fn_single_tuple.csv" + t_single_tuple_path = er_output_dir + "t_single_tuple.csv" # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 - # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8) - # 例如此处输入参数要求md左侧相似度字段至少为0.7,右侧指向'id'字段 - tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) - fn_mds, fn_vio = inference_from_record_pairs(fn_single_tuple_path, similarity_threshold, target_attr) + # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值 + mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr) - # 如果不需要输出support和confidence,去掉下面两行 - tp_mds_meta = get_mds_metadata(tp_mds, tp_single_tuple_path, target_attr) - tp_vio_meta = get_mds_metadata(tp_vio, tp_single_tuple_path, target_attr) - - fn_mds_meta = get_mds_metadata(fn_mds, fn_single_tuple_path, target_attr) - fn_vio_meta = get_mds_metadata(fn_vio, fn_single_tuple_path, target_attr) - - # 若输出support和confidence,使用以下两块代码 # 将列表1写入本地,路径需自己修改 - tp_mds_path = md_output_dir + "tp_mds.txt" - tp_vio_path = md_output_dir + "tp_vio.txt" + mds_path = md_output_dir + "mds.txt" + vio_path = md_output_dir + "vio.txt" - with open(tp_mds_path, 'w') as f: - for _ in tp_mds_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') - f.write('\n') - - with open(tp_vio_path, 'w') as f: - for _ in tp_vio_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') - f.write('\n') - - fn_mds_path = md_output_dir + "fn_mds.txt" - fn_vio_path = md_output_dir + "fn_vio.txt" - - with open(fn_mds_path, 'w') as f: - for _ in fn_mds_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') + with open(mds_path, 'w') as f: + for _ in mds_list: + f.write('Target:'+str(target_attr) + '\t') + f.write(str(_)) f.write('\n') - with open(fn_vio_path, 'w') as f: - for _ in fn_vio_meta: - for i in _.keys(): - f.write(i + ':' + str(_[i]) + '\t') + with open(vio_path, 'w') as f: + for _ in vio_list: + f.write('Target:'+str(target_attr) + '\t') + f.write(str(_)) 
f.write('\n') diff --git a/md_discovery/multi_process_infer_by_pairs.py b/md_discovery/multi_process_infer_by_pairs.py index d89e51f..319c24d 100644 --- a/md_discovery/multi_process_infer_by_pairs.py +++ b/md_discovery/multi_process_infer_by_pairs.py @@ -7,7 +7,7 @@ import time import torch from tqdm import tqdm from transformers import AutoTokenizer, AutoModel -from settings import model, embedding_dict, er_output_dir +from settings import model, er_output_dir from sentence_transformers.util import cos_sim conf_thresh = 0.8 @@ -91,7 +91,7 @@ def test_load(): # # print(sim.tolist()[0][0]/2 + 0.5) -def if_minimal(md, md_list, target_col): +def is_minimal(md, md_list, target_col): # 假设这个md是minimal minimal = True if len(md_list) == 0: @@ -153,7 +153,7 @@ def inference_from_record_pairs(path, threshold, target_col): # sims是两行的相似度 sims = {} for col in columns: - similarity = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)]) + similarity = norm_cos_sim(getattr(row1, col), getattr(row2, col)) sims[col] = similarity # 寻找violated md,从md列表中删除并加入vio列表 @@ -178,7 +178,7 @@ def inference_from_record_pairs(path, threshold, target_col): # new_rhs = sims[target_col] # spec_r_md = copy.deepcopy(vio_md) # spec_r_md[target_col] = new_rhs - # if if_minimal(spec_r_md, md_list, target_col): + # if is_minimal(spec_r_md, md_list, target_col): # md_list.append(spec_r_md) # 特殊化左侧 @@ -186,11 +186,11 @@ def inference_from_record_pairs(path, threshold, target_col): if sims[col] + 0.01 <= 1: spec_l_md = copy.deepcopy(vio_md) spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01 - if if_minimal(spec_l_md, md_list, target_col): + if is_minimal(spec_l_md, md_list, target_col): md_list.append(spec_l_md) # for vio in minimal_vio[:]: - # if not if_minimal(vio, md_list, target_col): + # if not is_minimal(vio, md_list, target_col): # minimal_vio.remove(vio) # fuck = len(minimal_vio) @@ -216,11 +216,11 @@ def inference_from_record_pairs(path, threshold, target_col): # minimal_vio = list(proxy_minimal_vio) # # for _ in minimal_vio[:]: - # if not if_minimal(_, minimal_vio, target_col): + # if not is_minimal(_, minimal_vio, target_col): # minimal_vio.remove(_) # for _ in md_list[:]: - if not if_minimal(_, md_list, target_col): + if not is_minimal(_, md_list, target_col): md_list.remove(_) return md_list, minimal_vio @@ -258,7 +258,7 @@ def get_one_md_metadata(md, dataframe, target_col): left_satisfy = True both_satisfy = True for col in columns: - sim = norm_cos_sim(embedding_dict[getattr(row1, col)], embedding_dict[getattr(row2, col)]) + sim = norm_cos_sim(getattr(row1, col), getattr(row2, col)) if col == target_col: if sim + 0.0000001 < 1: both_satisfy = False diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py index b75ee07..4465e6e 100644 --- a/md_discovery/tmp_discover.py +++ b/md_discovery/tmp_discover.py @@ -4,27 +4,18 @@ from concurrent.futures import ProcessPoolExecutor from multiprocessing.managers import SharedMemoryManager import numpy as np -import pandas import pandas as pd -import Levenshtein import copy import torch from tqdm import tqdm -from md_discovery.multi_process_infer_by_pairs import norm_cos_sim -from settings import embedding_dict, model +from settings import model, md_output_dir conf_thresh = 0.8 -def my_Levenshtein_ratio(str1, str2): - if max(len(str1), len(str2)) == 0: - return 1 - return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2)) - - -def if_minimal(md, md_list, target_col): +def is_minimal(md, md_list, 
target_col): # 假设这个md是minimal if len(md_list) == 0: return True @@ -49,23 +40,7 @@ def if_minimal(md, md_list, target_col): return minimal -def remove_by_confidence(md, md_list, relation, sim_tensor, target_col, lock): - support, confidence = get_one_md_metadata(md, relation, sim_tensor, target_col) - if confidence < 0.8: - with lock: - md_list.remove(md) - - -# def remove_by_confidence(md, l, relation, target_col): -# boolean, conf = satisfy_confidence(md, relation, 0.8, target_col) -# if not boolean: -# l.remove(md) -# print(md, '\t', conf) - -# def build_sim_matrix(): -# width -# return 0 -def inference_from_record_pairs(path, threshold, target_col): +def pairs_inference(path, threshold, target_col): data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1') data.fillna("", inplace=True) data = data.astype(str) @@ -87,7 +62,7 @@ def inference_from_record_pairs(path, threshold, target_col): sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2)) sim_tensor = sim_tensor/2 + 0.5 - torch.save(sim_tensor, "E:\\Data\\Research\\Projects\\matching_dependency\\tensor.pt") + torch.save(sim_tensor, md_output_dir + "tensor.pt") md_list = [] minimal_vio = [] @@ -108,11 +83,10 @@ def inference_from_record_pairs(path, threshold, target_col): sims[col] = similarity # 寻找violated md,从md列表中删除并加入vio列表 - # tmp_md_list = copy.deepcopy(md_list) for md in md_list[:]: lhs_satis = True rhs_satis = True - for col in list(set(columns) - {target_col}): + for col in cols_but_target: if sims[col] < md[col]: lhs_satis = False break @@ -123,32 +97,23 @@ def inference_from_record_pairs(path, threshold, target_col): violated_mds.append(md) for vio_md in violated_mds: - # 特殊化右侧,我们需要右侧百分百相似,其实不需要降低右侧阈值 - # if sims[target_col] >= threshold: - # new_rhs = sims[target_col] - # spec_r_md = copy.deepcopy(vio_md) - # spec_r_md[target_col] = new_rhs - # if if_minimal(spec_r_md, md_list, target_col): - # md_list.append(spec_r_md) # 特殊化左侧 - for col in list(set(columns) - {target_col}): + for col in cols_but_target: if sims[col] + 0.01 <= 1: spec_l_md = copy.deepcopy(vio_md) spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01 - if if_minimal(spec_l_md, md_list, target_col): + if is_minimal(spec_l_md, md_list, target_col): md_list.append(spec_l_md) if vio_md not in minimal_vio: minimal_vio.append(vio_md) if len(md_list) == 0: terminate = True break - - # tmp_minimal_vio = copy.deepcopy(minimal_vio) if terminate: break if len(md_list) > 0: for vio in minimal_vio[:]: - if not if_minimal(vio, md_list, target_col): + if not is_minimal(vio, md_list, target_col): minimal_vio.remove(vio) print('mds_list\t', len(md_list), '\n') @@ -157,64 +122,23 @@ def inference_from_record_pairs(path, threshold, target_col): if len(minimal_vio) == 0: return md_list, [] - # manager = multiprocessing.Manager() - # lock = manager.Lock() - # pool_size = 4 - # pool = multiprocessing.Pool(pool_size) - # with manager: - # proxy_minimal_vio = manager.list(minimal_vio) - # for _ in minimal_vio[:]: - # pool.apply_async(remove_by_confidence, args=(_, proxy_minimal_vio, data, sim_tensor, target_col, lock)) - # pool.close() - # pool.join() - # minimal_vio = list(proxy_minimal_vio) - - # minimal_vio.reverse() - i = 0 remove_list = [] - fuck = [] + # fuck = [] for md in minimal_vio: support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index) - fuck.append((support, confidence)) + # fuck.append((support, confidence)) if support < 1: print('delete by support') remove_list.append(md) - if confidence 
< 0.8: + if confidence < 0.5: print('delete by confidence') remove_list.append(md) - fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True) - # while i < len(minimal_vio): - # print('vio_index\t', i) - # print('vio_length', len(minimal_vio)) - # current_md = minimal_vio[i] - # support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index) - # # if support < 50: - # # minimal_vio_length = len(minimal_vio) - # # j = i + 1 - # # while j < len(minimal_vio): - # # specialization = True - # # next_md = minimal_vio[j] - # # for col in cols_but_target: - # # if current_md[col] > next_md[col]: - # # specialization = False - # # break - # # if specialization: - # # minimal_vio.remove(next_md) - # # else: - # # j += 1 - # # print('sup') - # # minimal_vio.remove(current_md) - # if support < 1: - # print('delete by support') - # minimal_vio.remove(current_md) - # if confidence < 0.8: - # print('delete by confidence') - # minimal_vio.remove(current_md) - # if support >= 1 and confidence >= 0.8: - # i += 1 + # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True) + for _ in remove_list: + minimal_vio.remove(_) for _ in minimal_vio[:]: - if not if_minimal(_, minimal_vio, target_col): + if not is_minimal(_, minimal_vio, target_col): minimal_vio.remove(_) print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m') diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py index 1ac0e5e..c5427f3 100644 --- a/ml_er/ml_entity_resolver.py +++ b/ml_er/ml_entity_resolver.py @@ -1,33 +1,35 @@ +import json import os import sys +import ConfigSpace +import pandas +import torch from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric - +from ConfigSpace.read_and_write import json as csj import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm import pandas as pd import six from ConfigSpace import Configuration -from md_discovery.multi_process_infer_by_pairs import my_Levenshtein_ratio, norm_cos_sim from settings import * def process_prediction_for_md_discovery(pred: pd.DataFrame, - tp_single_tuple_path: str = er_output_dir + "tp_single_tuple.csv", - fn_single_tuple_path: str = er_output_dir + "fn_single_tuple.csv"): + t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"): # 提取预测表中真阳和假阴部分 tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)] fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)] + # 拼成一张表 + df = pd.concat([tp, fn]) # 将真阳/假阴表中左右ID调整一致 - for index, row in tp.iterrows(): - tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] - for index, row in fn.iterrows(): - fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] + for index, row in df.iterrows(): + df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] pred_columns = pred.columns.values.tolist() l_columns = [] r_columns = [] - columns = [] + cols = [] # 将预测表中左表和右表字段名分别加入两个列表 for _ in pred_columns: if _.startswith('ltable'): @@ -36,25 +38,15 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame, r_columns.append(_) # 将左表中字段名去掉前缀,作为统一的字段名列表(前提是两张表内对应字段名调整一致) for _ in l_columns: - columns.append(_.replace('ltable_', '')) - - # 将表拆分成左右两部分 - tpl = tp[l_columns] - tpr = tp[r_columns] - # 将左右两部分字段名统一 - tpl.columns = columns - tpr.columns = columns + cols.append(_.replace('ltable_', '')) - fnl = fn[l_columns] - fnr = fn[r_columns] - fnl.columns = columns - fnr.columns = columns + ldf = df[l_columns] + rdf = df[r_columns] + ldf.columns = cols + rdf.columns = cols + t_single_tuple = pd.concat([ldf, rdf]) - 
tp_single_tuple = pd.concat([tpl, tpr]) - fn_single_tuple = pd.concat([fnl, fnr]) - - tp_single_tuple.to_csv(tp_single_tuple_path, sep=',', index=False, header=True) - fn_single_tuple.to_csv(fn_single_tuple_path, sep=',', index=False, header=True) + t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True) def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, @@ -100,43 +92,57 @@ def load_mds(paths: list) -> list: # 读取每一行的md,加入该文件的md列表 for line in f.readlines(): md_metadata = line.strip().split('\t') - md = eval(md_metadata[0].replace('md:', '')) - confidence = eval(md_metadata[2].replace('confidence:', '')) - if confidence > 0: - mds.append(md) + # todo 如果MD文件的形式改了 这里也要改 + md = eval(md_metadata[1]) + mds.append(md) all_mds.extend(mds) return all_mds -def is_explicable(row, all_mds: list) -> bool: +def is_explicable(row, all_mds: list, st_dict) -> bool: attrs = all_mds[0].keys() # 从第一条md中读取所有字段 for md in all_mds: explicable = True # 假设这条md能解释当前元组 for a in attrs: - threshold = md[a] - if norm_cos_sim(embedding_dict[str(getattr(row, 'ltable_'+a))], - embedding_dict[str(getattr(row, 'rtable_'+a))]) < threshold: - explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 - break # 不再与当前md的其他相似度阈值比较,跳转到下一条md + if a != target_attr: + if st_dict[a][row[0]].item() < md[a]: + explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 + break # 不再与当前md的其他相似度阈值比较,跳转到下一条md if explicable: return True # 任意一条md能解释,直接返回 return False # 遍历结束,不能解释 -def load_data(left_path: str, right_path: str, mapping_path: str): - left = pd.read_csv(left_path, encoding='ISO-8859-1') - cm.set_key(left, left.columns.values.tolist()[0]) - left.fillna("", inplace=True) - left = left.astype(str) - - right = pd.read_csv(right_path, encoding='ISO-8859-1') - cm.set_key(right, right.columns.values.tolist()[0]) - right.fillna("", inplace=True) - right = right.astype(str) - - mapping = pd.read_csv(mapping_path) - mapping = mapping.astype(str) - return left, right, mapping +def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame): + predictions_attrs = predictions.columns.values.tolist() + col_tuple_list = [] + for _ in predictions_attrs: + if _.startswith('ltable'): + left_index = predictions_attrs.index(_) + right_index = predictions_attrs.index(_.replace('ltable_', 'rtable_')) + col_tuple_list.append((left_index, right_index)) + + length = predictions.shape[0] + width = predictions.shape[1] + sentences = [] + for col in range(0, width): + for row in range(0, length): + cell_value = predictions.values[row, col] + sentences.append(cell_value) + embedding = model.encode(sentences, convert_to_tensor=True, device="cuda") + split_embedding = torch.split(embedding, length, dim=0) + table_tensor = torch.stack(split_embedding, dim=0, out=None) + # prediction的归一化嵌入张量 + norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2) + sim_tensor_dict = {} + for col_tuple in col_tuple_list: + lattr_tensor = norm_table_tensor[col_tuple[0]] + rattr_tensor = norm_table_tensor[col_tuple[1]] + mul_tensor = lattr_tensor * rattr_tensor + sim_tensor = torch.sum(mul_tensor, 1) + sim_tensor = sim_tensor / 2 + 0.5 + sim_tensor_dict[predictions_attrs[col_tuple[0]].replace('ltable_', '')] = sim_tensor + return sim_tensor_dict def ml_er(iter_round: int, config: Configuration = None, ): @@ -277,30 +283,27 @@ def ml_er(iter_round: int, config: Configuration = None, ): predictions_attrs.extend(attrs_with_r_prefix) predictions_attrs.extend(['gold', 'predicted']) predictions = 
predictions[predictions_attrs] + process_prediction_for_md_discovery(predictions) + predictions = predictions.reset_index(drop=True) + predictions = predictions.astype(str) + sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions) - md_paths = [md_output_dir + 'tp_mds.txt', md_output_dir + 'tp_vio.txt', - md_output_dir + 'fn_mds.txt', md_output_dir + 'fn_vio.txt'] - epl_match = 0 # 可解释,预测match - nepl_mismatch = 0 # 不可解释,预测mismatch - + md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt'] md_list = load_mds(md_paths) # 从全局变量中读取所有的md + epl_match = 0 # 可解释,预测match if len(md_list) > 0: for row in predictions.itertuples(): - if is_explicable(row, md_list): - if getattr(row, 'predicted') == 1: - epl_match += 1 - else: - if getattr(row, 'predicted') == 0: - nepl_mismatch += 1 - - interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + if is_explicable(row, md_list, sim_tensor_dict) and str(getattr(row, 'predicted')) == str(1): + epl_match += 1 + + df = predictions[predictions['predicted'] == str(1)] + interpretability = epl_match / len(df) # 可解释性 if indicators["block_recall"] >= 0.8: f1 = indicators["F1"] else: f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (indicators["precision"] + indicators["block_recall"]) performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 ################################################################################################################ - process_prediction_for_md_discovery(predictions) output_path = er_output_dir + "eval_result_" + str(iter_round) + ".txt" with open(output_path, 'w') as f: @@ -313,4 +316,20 @@ def ml_er(iter_round: int, config: Configuration = None, ): if __name__ == '__main__': - ml_er(1) + iterations = 1 + filename_list = os.listdir(er_output_dir) + if len(filename_list) > 0: + for _ in filename_list: + if _.startswith('eval_result'): + iterations = int(_[12:13]) + 1 + + if iterations > 1: + incumbent_array = np.load(hpo_output_dir + 'incumbent.npy') + with open(hpo_output_dir + "configspace.json", 'r') as f: + dict_configspace = json.load(f) + str_configspace = json.dumps(dict_configspace) + configspace = csj.read(str_configspace) + configuration = ConfigSpace.Configuration(configspace, vector=incumbent_array) + ml_er(iterations, configuration) + else: + ml_er(1) diff --git a/settings.py b/settings.py index fbcabe8..a275b11 100644 --- a/settings.py +++ b/settings.py @@ -16,6 +16,5 @@ confidence_threshold = 0.8 interpre_weight = 0.3 # 可解释性权重 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\' md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\' +hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\' model = SentenceTransformer('E:\\Data\\Research\\Models\\paraphrase-MiniLM-L6-v2') -embedding_dict = np.load('E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\embedding_dic.npy', - allow_pickle=True).item() diff --git a/tfile.py b/tfile.py index d85cc0c..f6b65a6 100644 --- a/tfile.py +++ b/tfile.py @@ -1,13 +1,16 @@ +import json import multiprocessing +import os import time + +import ConfigSpace import numpy as np import pandas as pd import torch from tqdm import tqdm - -from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs +from ConfigSpace.read_and_write import json as csj from md_discovery import tmp_discover -from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict +from 
settings import er_output_dir, similarity_threshold, target_attr, hpo_output_dir def fuck(i): @@ -64,9 +67,60 @@ def test4(): print(torch.count_nonzero(one_bool_tensor).item()) +def test5(): + ten1 = torch.tensor([[1, 2, 3], + [7, 8, 9]]) + ten2 = torch.tensor([[4, 5, 6], + [11, 12, 15]]) + result = ten1 * ten2 + r = torch.sum(result, 1) + print('\n') + print(result) + print(r) + + +def test6(): + table_tensor = torch.tensor([[[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]], + [[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]]]) + t = torch.tensor([[1., 2., 3.], + [4., 5., 6.]]) + norm1 = torch.nn.functional.normalize(table_tensor, dim=1) + norm2 = torch.nn.functional.normalize(table_tensor, dim=2) + print('\n') + print(norm1) + print(norm2) + print(t.shape) + + +def test7(): + iterations = 1 + filename_list = os.listdir(er_output_dir) + if len(filename_list) > 0: + for _ in filename_list: + if _.startswith('eval_result'): + iterations = int(_[12:13]) + 1 + print(iterations) + + +def test8(): + cum = np.load(hpo_output_dir + 'incumbent.npy') + with open(hpo_output_dir + "configspace.json", 'r') as load_f: + dict_configspace = json.load(load_f) + str_configspace = json.dumps(dict_configspace) + configspace = csj.read(str_configspace) + config = ConfigSpace.Configuration(configspace, vector=cum) + print(cum) + + if __name__ == '__main__': start = time.time() - tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv" + t_single_tuple_path = er_output_dir + "t_single_tuple.csv" # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) - tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr) + tp_mds, tp_vio = tmp_discover.pairs_inference(t_single_tuple_path, similarity_threshold, target_attr) print(time.time() - start) + +
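
Note (not part of the patch): the recurring pattern in this change set is GPU-side cosine similarity over sentence embeddings. Cell values are encoded with the SentenceTransformer model, L2-normalized so that a row-wise dot product equals cosine similarity, and then rescaled from [-1, 1] to [0, 1] via sim / 2 + 0.5, which is the value compared against the MD thresholds. The following is a minimal, self-contained sketch of that computation; the model name and the example strings are placeholders, not taken from the repository.

    import torch
    from sentence_transformers import SentenceTransformer

    # Hypothetical model reference; the patch loads paraphrase-MiniLM-L6-v2 from a local path in settings.py.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    left_cells = ['Apple iPhone 13', 'Sony WH-1000XM4']          # ltable_* column values
    right_cells = ['iPhone 13 (Apple)', 'Sony wireless headset']  # matching rtable_* column values

    # Encode both columns in one batch; pass device="cuda" as the patch does when a GPU is available.
    emb = model.encode(left_cells + right_cells, convert_to_tensor=True)
    left, right = emb[:len(left_cells)], emb[len(left_cells):]

    # Normalize so the elementwise product summed over the embedding dimension is cosine similarity.
    left = torch.nn.functional.normalize(left, dim=1)
    right = torch.nn.functional.normalize(right, dim=1)

    # Row-wise cosine similarity, rescaled from [-1, 1] to [0, 1] as in pairs_inference and build_col_pairs_sim_tensor_dict.
    sim = torch.sum(left * right, dim=1) / 2 + 0.5
    print(sim)

The rescaling keeps every similarity in [0, 1], so the MD left-hand-side thresholds (and the 0.01 specialization step used when a violated MD is tightened) stay on a single, bounded scale regardless of the embedding model.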