From 20c33c0fd8fa7271b88b4407a328a999d4c81d70 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Thu, 26 Oct 2023 23:00:47 +0800
Subject: [PATCH] Use the GPU to compute metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                    |   2 -
 md_discovery/multi_process_infer_by_pairs.py |  35 +++---
 md_discovery/tmp_discover.py                 | 115 ++++++++++++-------
 samples_generator.py                         | 114 ------------------
 tfile.py                                     |  84 +++++++++-----
 5 files changed, 149 insertions(+), 201 deletions(-)
 delete mode 100644 README.md
 delete mode 100644 samples_generator.py

diff --git a/README.md b/README.md
deleted file mode 100644
index 3c8a8c8..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# matching_dependency
-
diff --git a/md_discovery/multi_process_infer_by_pairs.py b/md_discovery/multi_process_infer_by_pairs.py
index b40970f..d89e51f 100644
--- a/md_discovery/multi_process_infer_by_pairs.py
+++ b/md_discovery/multi_process_infer_by_pairs.py
@@ -7,7 +7,7 @@ import time
 import torch
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModel
-from settings import model, embedding_dict
+from settings import model, embedding_dict, er_output_dir
 from sentence_transformers.util import cos_sim

 conf_thresh = 0.8
@@ -38,12 +38,13 @@ def table_encode(tp_path, fn_path):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
             tp_sentences.append(cell_value)
-    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True)
+    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True, device="cuda")
+    list_tp_embedding = tp_embedding.tolist()
     for row in range(0, tp_length):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
-            embedding_dic[cell_value] = tp_embedding.tolist()[row * tp_width + col]
+            embedding_dic[cell_value] = list_tp_embedding[row * tp_width + col]

     fn_data = pd.read_csv(fn_path, low_memory=False, encoding='ISO-8859-1')
@@ -57,19 +58,20 @@ def table_encode(tp_path, fn_path):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
             fn_sentences.append(cell_value)
-    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True)
+    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True, device="cuda")
+    list_fn_embedding = fn_embedding.tolist()
     for row in range(0, fn_length):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
-            embedding_dic[cell_value] = fn_embedding.tolist()[row * fn_width + col]
+            embedding_dic[cell_value] = list_fn_embedding[row * fn_width + col]

     np.save('embedding_dic.npy', embedding_dic)


 def test_table_encode():
     start = time.time()
-    table_encode('../ml_er/output/tp_single_tuple.csv', '../ml_er/output/fn_single_tuple.csv')
+    table_encode(er_output_dir+'tp_single_tuple.csv', er_output_dir+'fn_single_tuple.csv')
     print(time.time()-start)
@@ -133,6 +135,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    other_columns = list(set(columns) - {target_col})
     md_list = []
     minimal_vio = []
@@ -141,7 +144,7 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)

-    for row1 in data.itertuples():
+    for row1 in tqdm(data.itertuples()):
         # get the index of the current row and slice the data starting from the next row
         i = row1[0]
         data1 = data[i + 1:]
@@ -158,7 +161,7 @@ def inference_from_record_pairs(path, threshold, target_col):
             for md in md_list[:]:
                 lhs_satis = True
                 rhs_satis = True
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.0000001 < md[col]:
                         lhs_satis = False
                         break
@@ -167,7 +170,7 @@ def inference_from_record_pairs(path, threshold, target_col):
                 if lhs_satis == True and rhs_satis == False:
                     md_list.remove(md)
                     violated_mds.append(md)
-            minimal_vio.extend(violated_mds)
+            # minimal_vio.extend(violated_mds)

             for vio_md in violated_mds:
                 # specialize the RHS: we need the RHS to be 100% similar, so there is actually no need to lower the RHS threshold
@@ -179,16 +182,16 @@ def inference_from_record_pairs(path, threshold, target_col):
                 #         md_list.append(spec_r_md)

                 # specialize the LHS
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
                         spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)

-    for vio in minimal_vio[:]:
-        if not if_minimal(vio, md_list, target_col):
-            minimal_vio.remove(vio)
+    # for vio in minimal_vio[:]:
+    #     if not if_minimal(vio, md_list, target_col):
+    #         minimal_vio.remove(vio)

     # fuck = len(minimal_vio)
     # tmp = []
@@ -216,9 +219,9 @@ def inference_from_record_pairs(path, threshold, target_col):
     #     if not if_minimal(_, minimal_vio, target_col):
     #         minimal_vio.remove(_)
     #
-    # for _ in md_list[:]:
-    #     if not if_minimal(_, md_list, target_col):
-    #         md_list.remove(_)
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)

     return md_list, minimal_vio

diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py
index 0edf1b9..b75ee07 100644
--- a/md_discovery/tmp_discover.py
+++ b/md_discovery/tmp_discover.py
@@ -70,6 +70,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
     cols_but_target = list(set(columns) - {target_col})
     length = data.shape[0]
     width = data.shape[1]
@@ -95,7 +96,6 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)

-    start = time.time()
     for row1 in range(0, length - 1):
         terminate = False
         for row2 in range(row1 + 1, length):
@@ -132,9 +132,9 @@ def inference_from_record_pairs(path, threshold, target_col):
                 #         md_list.append(spec_r_md)
                 # specialize the LHS
                 for col in list(set(columns) - {target_col}):
-                    if sims[col] + 0.05 <= 1:
+                    if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.05
+                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
                 if vio_md not in minimal_vio:
@@ -151,9 +151,8 @@ def inference_from_record_pairs(path, threshold, target_col):
         if not if_minimal(vio, md_list, target_col):
             minimal_vio.remove(vio)

-    print(time.time()-start, '\n')
-    print(len(md_list), '\n')
-    print(len(minimal_vio), '\n')
+    print('mds_list\t', len(md_list), '\n')
+    print('vio_list\t', len(minimal_vio), '\n')

     if len(minimal_vio) == 0:
         return md_list, []
@@ -170,49 +169,87 @@ def inference_from_record_pairs(path, threshold, target_col):
     #     pool.join()
     # minimal_vio = list(proxy_minimal_vio)

-    start = time.time()
-    minimal_vio.reverse()
+    # minimal_vio.reverse()
     i = 0
-    while i < len(minimal_vio):
-        print(i)
-        print(len(minimal_vio))
-        current_md = minimal_vio[i]
-        support, confidence = get_one_md_metadata(current_md, data, sim_tensor, target_col)
-        if support < 50:
-            minimal_vio_length = len(minimal_vio)
-            j = i + 1
-            while j < len(minimal_vio):
-                specialization = True
-                next_md = minimal_vio[j]
-                for col in cols_but_target:
-                    if current_md[col] > next_md[col]:
-                        specialization = False
-                        break
-                if specialization:
-                    minimal_vio.remove(next_md)
-                else:
-                    j += 1
-            print('sup')
-            minimal_vio.remove(current_md)
+    remove_list = []
+    fuck = []
+    for md in minimal_vio:
+        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+        fuck.append((support, confidence))
+        if support < 1:
+            print('delete by support')
+            remove_list.append(md)
         if confidence < 0.8:
-            print('conf')
-            minimal_vio.remove(current_md)
-        if support >= 50 and confidence >= 0.8:
-            i += 1
-    print(time.time()-start)
-
-
+            print('delete by confidence')
+            remove_list.append(md)
+    fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
+    # while i < len(minimal_vio):
+    #     print('vio_index\t', i)
+    #     print('vio_length', len(minimal_vio))
+    #     current_md = minimal_vio[i]
+    #     support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
+    #     # if support < 50:
+    #     #     minimal_vio_length = len(minimal_vio)
+    #     #     j = i + 1
+    #     #     while j < len(minimal_vio):
+    #     #         specialization = True
+    #     #         next_md = minimal_vio[j]
+    #     #         for col in cols_but_target:
+    #     #             if current_md[col] > next_md[col]:
+    #     #                 specialization = False
+    #     #                 break
+    #     #         if specialization:
+    #     #             minimal_vio.remove(next_md)
+    #     #         else:
+    #     #             j += 1
+    #     #     print('sup')
+    #     #     minimal_vio.remove(current_md)
+    #     if support < 1:
+    #         print('delete by support')
+    #         minimal_vio.remove(current_md)
+    #     if confidence < 0.8:
+    #         print('delete by confidence')
+    #         minimal_vio.remove(current_md)
+    #     if support >= 1 and confidence >= 0.8:
+    #         i += 1

-    t1 = time.time()
     for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
-    print(time.time() - t1)
+
+    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
     return md_list, minimal_vio


+def get_metrics(current_md, data, sim_tensor, target_col, target_index):
+    columns = data.columns.values.tolist()
+    length = data.shape[0]
+    width = data.shape[1]
+
+    md_tensor = list(current_md.values())
+    md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support = torch.count_nonzero(sup_tensor_int).item()
+
+    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = ini_slice.int()
+    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
+    confidence = confidence_numerator / support
+
+    return support, confidence
+
+
 def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)
diff --git a/samples_generator.py b/samples_generator.py
deleted file mode 100644
index 0b15a09..0000000
--- a/samples_generator.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import os
-import random
-
-import pandas as pd
-import Levenshtein
-
-import ml_er.ml_entity_resolver
-
-
-def my_Levenshtein_ratio(str1, str2):
-    if max(len(str1), len(str2)) == 0:
-        return 1
-    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # a list of md paths is passed in
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # open each md file
-        with open(md_path, 'r') as f:
-            # read the md on each line and add it to this file's md list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-# input: list of md paths / path of the prediction table / number of random draws
-# output: some positive samples (with a gold column but no prediction column)
-def generate_samples(md_path_list, pred_path, count: int):
-    all_mds = load_mds(md_path_list)
-
-    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
-    predictions.fillna("", inplace=True)
-    predictions = predictions.astype(str)
-    pred_attrs = predictions.columns.values.tolist()  # columns of the prediction table, with prefixes, including gold and predict
-    attrs = []  # columns without prefixes, excluding gold and predict
-    l_attrs = []
-    r_attrs = []
-    for _ in pred_attrs:
-        if _.startswith('ltable_'):
-            attrs.append(_.replace('ltable_', ''))
-            l_attrs.append(_)
-        elif _.startswith('rtable'):
-            r_attrs.append(_)
-
-    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
-    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
-
-    fpl = fp[l_attrs]
-    fpr = fp[r_attrs]
-    # unify the column names of the left and right halves
-    fpl.columns = attrs
-    fpr.columns = attrs
-    fnl = fn[l_attrs]
-    fnr = fn[r_attrs]
-    fnl.columns = attrs
-    fnr.columns = attrs
-    fp = pd.concat([fpl, fpr])
-    fn = pd.concat([fnl, fnr])
-    df = pd.concat([fp, fn])
-    length = len(df)
-
-    result = pd.DataFrame()
-    for i in range(0, count):
-        dic = {}
-        for _ in attrs:
-            if _ == 'id':
-                index = random.randint(0, length-1)
-                value = df.iloc[index]['id']
-                dic['ltable_'+_] = value
-                dic['rtable_'+_] = value
-            else:
-                index1 = random.randint(0, length-1)
-                index2 = random.randint(0, length-1)
-                value1 = df.iloc[index1][_]
-                value2 = df.iloc[index2][_]
-                dic['ltable_'+_] = value1
-                dic['rtable_'+_] = value2
-
-        for md in all_mds:
-            satis = True
-            for _ in attrs:
-                if my_Levenshtein_ratio(str(dic['ltable_'+_]), str(dic['rtable_'+_])) < md[_]:
-                    satis = False
-                    break
-            if satis:
-                series = pd.Series(dic)
-                result = result._append(series, ignore_index=True)
-    result['gold'] = 1
-    return result
-
-    # check whether the dict satisfies some md; if it does, convert it to a Series and insert it into the dataframe (initially empty)
-
-
-if __name__ == '__main__':
-    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
-
-    pre_p = '/home/w/pred.csv'
-    generate_samples(md_paths, pre_p, 10000)
-    # setting the number of random draws to a thousand or even ten thousand is fine
diff --git a/tfile.py b/tfile.py
index f6005f7..d85cc0c 100644
--- a/tfile.py
+++ b/tfile.py
@@ -8,8 +8,60 @@ from tqdm import tqdm
 from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
 from md_discovery import tmp_discover
 from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
+
+
 def fuck(i):
-    i = i*i+1
+    i = i * i + 1
+
+
+def test1():
+    li = [[[6, 6, 2],
+           [2, 4, 6],
+           [2, 4, 7],
+           [3, 6, 4]],
+          [[6, 2, 7],
+           [3, 2, 4],
+           [5, 3, 5],
+           [6, 2, 4]],
+          [[7, 2, 2],
+           [6, 3, 2],
+           [6, 4, 3],
+           [6, 5, 6]]]
+    tensor = torch.Tensor(li)
+    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
+    print(norm_tensor, '\n')
+    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
+    print(sim_ten / 2 + 0.5, '\n')
+    print(sim_ten.size())
+
+
+def test2():
+    multiprocessing.set_start_method("spawn")
+    manager = multiprocessing.Manager()
+    lock = manager.Lock()
+    pool = multiprocessing.Pool(16)
+    with manager:
+        for _ in tqdm(range(0, 1000)):
+            result = pool.apply_async(fuck, args=(_,))
+            print(result)
+
+
+def test3():
+    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
+    ll = list(dic.values())
+    ten = torch.Tensor(ll)
+    t = ten.unsqueeze(1)
+    t = t.unsqueeze(2)
+    y = t.repeat(1, 742, 742)
+    print(ten)
+    print(y)
+    print(torch.isfinite(ten))
+    print(torch.count_nonzero(y).item())
+
+
+def test4():
+    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
+    print(torch.count_nonzero(one_bool_tensor).item())


 if __name__ == '__main__':
@@ -17,32 +69,4 @@ if __name__ == '__main__':
     start = time.time()
     tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
     # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
     tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    print(time.time()-start)
-
-    # li = [[[6, 6, 2],
-    #        [2, 4, 6],
-    #        [2, 4, 7],
-    #        [3, 6, 4]],
-    #       [[6, 2, 7],
-    #        [3, 2, 4],
-    #        [5, 3, 5],
-    #        [6, 2, 4]],
-    #       [[7, 2, 2],
-    #        [6, 3, 2],
-    #        [6, 4, 3],
-    #        [6, 5, 6]]]
-    # tensor = torch.Tensor(li)
-    # norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    # print(norm_tensor, '\n')
-    # sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    # print(sim_ten/2 + 0.5, '\n')
-    # print(sim_ten.size())
-
-    # multiprocessing.set_start_method("spawn")
-    # manager = multiprocessing.Manager()
-    # lock = manager.Lock()
-    # pool = multiprocessing.Pool(16)
-    # with manager:
-    #     for _ in tqdm(range(0, 1000)):
-    #         result = pool.apply_async(fuck, args=(_,))
-    #         print(result)
+    print(time.time() - start)
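
For context, the core of this commit is the new get_metrics function in md_discovery/tmp_discover.py: instead of re-scanning record pairs in Python loops, it compares a precomputed similarity tensor of shape (width, length, length) against one MD's per-column thresholds on the GPU, then counts the record pairs that satisfy the LHS (support) and, among those, the ones that also satisfy the RHS (confidence). The snippet below is a minimal, self-contained sketch of that computation on CPU tensors; the shapes, thresholds, and random similarities are illustrative assumptions, not values from the repository.

import torch

# Illustrative sizes: 3 columns, 4 records, column 2 is the target (RHS) attribute.
width, length, target_index = 3, 4, 2
sim_tensor = torch.rand(width, length, length)   # stand-in for the precomputed pairwise similarities
md = torch.tensor([0.7, 0.8, 0.9])               # per-column thresholds of one candidate MD

# Broadcast the thresholds over all record pairs and test every column at once.
thresholds = md.view(width, 1, 1).expand(width, length, length)
satisfied = torch.ge(sim_tensor, thresholds)

# Support: pairs whose non-target columns all meet their thresholds.
lhs_mask = torch.ones((length, length), dtype=torch.bool)
for col in range(width):
    if col != target_index:
        lhs_mask &= satisfied[col]
support = int(torch.count_nonzero(lhs_mask))

# Confidence: fraction of supporting pairs whose target column also meets its threshold.
rhs_mask = lhs_mask & satisfied[target_index]
confidence = int(torch.count_nonzero(rhs_mask)) / support if support else 0.0
print(support, confidence)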