Compute metrics on the GPU

HuangJintao committed 1 year ago
parent 1ccfc9abd6
commit 20c33c0fd8

@@ -1,2 +0,0 @@
-# matching_dependency

@@ -7,7 +7,7 @@ import time
 import torch
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModel
-from settings import model, embedding_dict
+from settings import model, embedding_dict, er_output_dir
 from sentence_transformers.util import cos_sim
 conf_thresh = 0.8
@@ -38,12 +38,13 @@ def table_encode(tp_path, fn_path):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
             tp_sentences.append(cell_value)
-    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True)
+    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True, device="cuda")
+    list_tp_embedding = tp_embedding.tolist()
     for row in range(0, tp_length):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
-            embedding_dic[cell_value] = tp_embedding.tolist()[row * tp_width + col]
+            embedding_dic[cell_value] = list_tp_embedding[row * tp_width + col]
     fn_data = pd.read_csv(fn_path, low_memory=False, encoding='ISO-8859-1')
@@ -57,19 +58,20 @@ def table_encode(tp_path, fn_path):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
             fn_sentences.append(cell_value)
-    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True)
+    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True, device="cuda")
+    list_fn_embedding = fn_embedding.tolist()
     for row in range(0, fn_length):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
-            embedding_dic[cell_value] = fn_embedding.tolist()[row * fn_width + col]
+            embedding_dic[cell_value] = list_fn_embedding[row * fn_width + col]
     np.save('embedding_dic.npy', embedding_dic)
 
 
 def test_table_encode():
     start = time.time()
-    table_encode('../ml_er/output/tp_single_tuple.csv', '../ml_er/output/fn_single_tuple.csv')
+    table_encode(er_output_dir+'tp_single_tuple.csv', er_output_dir+'fn_single_tuple.csv')
     print(time.time()-start)
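The table_encode hunks above are essentially a batching fix: all cell values are encoded in one GPU call (device="cuda") and the embedding tensor is moved to the CPU once, instead of calling .tolist() inside the per-cell loop. A minimal sketch of the same pattern, not taken from the repository (the model name and variable names are illustrative):

# Sketch: batch-encode cell values on the GPU, then convert to a Python list once.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model name, for illustration only
cells = ['apple', 'banana', 'cherry']

embeddings = model.encode(cells, convert_to_tensor=True, device='cuda')
embedding_list = embeddings.tolist()  # single GPU -> CPU transfer
embedding_dic = {cell: embedding_list[i] for i, cell in enumerate(cells)}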
@@ -133,6 +135,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    other_columns = list(set(columns) - {target_col})
     md_list = []
     minimal_vio = []
@@ -141,7 +144,7 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)
-    for row1 in data.itertuples():
+    for row1 in tqdm(data.itertuples()):
         # Get the index of the current row and slice starting from the next row
         i = row1[0]
         data1 = data[i + 1:]
@@ -158,7 +161,7 @@ def inference_from_record_pairs(path, threshold, target_col):
             for md in md_list[:]:
                 lhs_satis = True
                 rhs_satis = True
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.0000001 < md[col]:
                         lhs_satis = False
                         break
@@ -167,7 +170,7 @@ def inference_from_record_pairs(path, threshold, target_col):
                 if lhs_satis == True and rhs_satis == False:
                     md_list.remove(md)
                     violated_mds.append(md)
-            minimal_vio.extend(violated_mds)
+            # minimal_vio.extend(violated_mds)
             for vio_md in violated_mds:
                 # Specialize the RHS: we require the RHS to be 100% similar, so there is actually no need to lower the RHS threshold
@@ -179,16 +182,16 @@ def inference_from_record_pairs(path, threshold, target_col):
                 # md_list.append(spec_r_md)
                 # Specialize the LHS
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
                         spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
-    for vio in minimal_vio[:]:
-        if not if_minimal(vio, md_list, target_col):
-            minimal_vio.remove(vio)
+    # for vio in minimal_vio[:]:
+    #     if not if_minimal(vio, md_list, target_col):
+    #         minimal_vio.remove(vio)
     # fuck = len(minimal_vio)
     # tmp = []
@@ -216,9 +219,9 @@ def inference_from_record_pairs(path, threshold, target_col):
     # if not if_minimal(_, minimal_vio, target_col):
     #     minimal_vio.remove(_)
     #
-    # for _ in md_list[:]:
-    #     if not if_minimal(_, md_list, target_col):
-    #         md_list.remove(_)
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
     return md_list, minimal_vio
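The re-enabled filter at the end of this hunk depends on if_minimal, which is not shown in this commit. A purely illustrative sketch of what such a dominance check could look like (hypothetical name if_minimal_sketch; the repository's actual implementation may differ):

# Hypothetical sketch: an MD candidate is minimal if no other MD in md_list
# dominates it, i.e. imposes thresholds that are all <= the candidate's on
# the non-target columns, with at least one strictly smaller.
def if_minimal_sketch(md, md_list, target_col):
    for other in md_list:
        if other is md:
            continue
        dominated = all(other[col] <= md[col] for col in md if col != target_col)
        strictly = any(other[col] < md[col] for col in md if col != target_col)
        if dominated and strictly:
            return False
    return True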

@@ -70,6 +70,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
     cols_but_target = list(set(columns) - {target_col})
     length = data.shape[0]
     width = data.shape[1]
@@ -95,7 +96,6 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)
-    start = time.time()
     for row1 in range(0, length - 1):
         terminate = False
         for row2 in range(row1 + 1, length):
@@ -132,9 +132,9 @@ def inference_from_record_pairs(path, threshold, target_col):
                 # md_list.append(spec_r_md)
                 # Specialize the LHS
                 for col in list(set(columns) - {target_col}):
-                    if sims[col] + 0.05 <= 1:
+                    if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.05
+                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
                 if vio_md not in minimal_vio:
@@ -151,9 +151,8 @@ def inference_from_record_pairs(path, threshold, target_col):
         if not if_minimal(vio, md_list, target_col):
             minimal_vio.remove(vio)
-    print(time.time()-start, '\n')
-    print(len(md_list), '\n')
-    print(len(minimal_vio), '\n')
+    print('mds_list\t', len(md_list), '\n')
+    print('vio_list\t', len(minimal_vio), '\n')
     if len(minimal_vio) == 0:
         return md_list, []
@@ -170,49 +169,87 @@ def inference_from_record_pairs(path, threshold, target_col):
     # pool.join()
     # minimal_vio = list(proxy_minimal_vio)
-    start = time.time()
-    minimal_vio.reverse()
+    # minimal_vio.reverse()
     i = 0
-    while i < len(minimal_vio):
-        print(i)
-        print(len(minimal_vio))
-        current_md = minimal_vio[i]
-        support, confidence = get_one_md_metadata(current_md, data, sim_tensor, target_col)
-        if support < 50:
-            minimal_vio_length = len(minimal_vio)
-            j = i + 1
-            while j < len(minimal_vio):
-                specialization = True
-                next_md = minimal_vio[j]
-                for col in cols_but_target:
-                    if current_md[col] > next_md[col]:
-                        specialization = False
-                        break
-                if specialization:
-                    minimal_vio.remove(next_md)
-                else:
-                    j += 1
-            print('sup')
-            minimal_vio.remove(current_md)
-        if confidence < 0.8:
-            print('conf')
-            minimal_vio.remove(current_md)
-        if support >= 50 and confidence >= 0.8:
-            i += 1
-    print(time.time()-start)
+    remove_list = []
+    fuck = []
+    for md in minimal_vio:
+        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+        fuck.append((support, confidence))
+        if support < 1:
+            print('delete by support')
+            remove_list.append(md)
+        if confidence < 0.8:
+            print('delete by confidence')
+            remove_list.append(md)
+    fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
+    # while i < len(minimal_vio):
+    #     print('vio_index\t', i)
+    #     print('vio_length', len(minimal_vio))
+    #     current_md = minimal_vio[i]
+    #     support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
+    #     # if support < 50:
+    #     #     minimal_vio_length = len(minimal_vio)
+    #     #     j = i + 1
+    #     #     while j < len(minimal_vio):
+    #     #         specialization = True
+    #     #         next_md = minimal_vio[j]
+    #     #         for col in cols_but_target:
+    #     #             if current_md[col] > next_md[col]:
+    #     #                 specialization = False
+    #     #                 break
+    #     #         if specialization:
+    #     #             minimal_vio.remove(next_md)
+    #     #         else:
+    #     #             j += 1
+    #     #     print('sup')
+    #     #     minimal_vio.remove(current_md)
+    #     if support < 1:
+    #         print('delete by support')
+    #         minimal_vio.remove(current_md)
+    #     if confidence < 0.8:
+    #         print('delete by confidence')
+    #         minimal_vio.remove(current_md)
+    #     if support >= 1 and confidence >= 0.8:
+    #         i += 1
+    t1 = time.time()
     for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
+    print(time.time() - t1)
+    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
     return md_list, minimal_vio
+
+
+def get_metrics(current_md, data, sim_tensor, target_col, target_index):
+    columns = data.columns.values.tolist()
+    length = data.shape[0]
+    width = data.shape[1]
+    md_tensor = list(current_md.values())
+    md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support = torch.count_nonzero(sup_tensor_int).item()
+    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = ini_slice.int()
+    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
+    confidence = confidence_numerator / support
+    return support, confidence
+
+
 def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)

@@ -1,114 +0,0 @@
-import os
-import random
-import pandas as pd
-import Levenshtein
-import ml_er.ml_entity_resolver
-
-
-def my_Levenshtein_ratio(str1, str2):
-    if max(len(str1), len(str2)) == 0:
-        return 1
-    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # Iterate over the list of MD file paths passed in
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # Open each MD file
-        with open(md_path, 'r') as f:
-            # Read the MD on each line and add it to this file's MD list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-# Input: list of MD paths / prediction table path / number of random generations
-# Output: some positive samples (with a gold column, without a prediction column)
-def generate_samples(md_path_list, pred_path, count: int):
-    all_mds = load_mds(md_path_list)
-    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
-    predictions.fillna("", inplace=True)
-    predictions = predictions.astype(str)
-    pred_attrs = predictions.columns.values.tolist()  # columns of the prediction table, with prefixes, including gold and predicted
-    attrs = []  # columns without prefixes, excluding gold and predicted
-    l_attrs = []
-    r_attrs = []
-    for _ in pred_attrs:
-        if _.startswith('ltable_'):
-            attrs.append(_.replace('ltable_', ''))
-            l_attrs.append(_)
-        elif _.startswith('rtable'):
-            r_attrs.append(_)
-    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
-    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
-    fpl = fp[l_attrs]
-    fpr = fp[r_attrs]
-    # Unify the column names of the left and right parts
-    fpl.columns = attrs
-    fpr.columns = attrs
-    fnl = fn[l_attrs]
-    fnr = fn[r_attrs]
-    fnl.columns = attrs
-    fnr.columns = attrs
-    fp = pd.concat([fpl, fpr])
-    fn = pd.concat([fnl, fnr])
-    df = pd.concat([fp, fn])
-    length = len(df)
-    result = pd.DataFrame()
-    for i in range(0, count):
-        dic = {}
-        for _ in attrs:
-            if _ == 'id':
-                index = random.randint(0, length-1)
-                value = df.iloc[index]['id']
-                dic['ltable_'+_] = value
-                dic['rtable_'+_] = value
-            else:
-                index1 = random.randint(0, length-1)
-                index2 = random.randint(0, length-1)
-                value1 = df.iloc[index1][_]
-                value2 = df.iloc[index2][_]
-                dic['ltable_'+_] = value1
-                dic['rtable_'+_] = value2
-        for md in all_mds:
-            satis = True
-            for _ in attrs:
-                if my_Levenshtein_ratio(str(dic['ltable_'+_]), str(dic['rtable_'+_])) < md[_]:
-                    satis = False
-                    break
-            if satis:
-                series = pd.Series(dic)
-                result = result._append(series, ignore_index=True)
-    result['gold'] = 1
-    return result
-
-
-# Check whether a dict satisfies some MD; if so, convert it to a Series and insert it into the (initially empty) DataFrame
-if __name__ == '__main__':
-    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
-    pre_p = '/home/w/pred.csv'
-    generate_samples(md_paths, pre_p, 10000)
-    # A generation count of a thousand or even ten thousand is fine

@@ -8,8 +8,60 @@ from tqdm import tqdm
 from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
 from md_discovery import tmp_discover
 from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
 
 
 def fuck(i):
-    i = i*i+1
+    i = i * i + 1
+
+
+def test1():
+    li = [[[6, 6, 2],
+           [2, 4, 6],
+           [2, 4, 7],
+           [3, 6, 4]],
+          [[6, 2, 7],
+           [3, 2, 4],
+           [5, 3, 5],
+           [6, 2, 4]],
+          [[7, 2, 2],
+           [6, 3, 2],
+           [6, 4, 3],
+           [6, 5, 6]]]
+    tensor = torch.Tensor(li)
+    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
+    print(norm_tensor, '\n')
+    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
+    print(sim_ten / 2 + 0.5, '\n')
+    print(sim_ten.size())
+
+
+def test2():
+    multiprocessing.set_start_method("spawn")
+    manager = multiprocessing.Manager()
+    lock = manager.Lock()
+    pool = multiprocessing.Pool(16)
+    with manager:
+        for _ in tqdm(range(0, 1000)):
+            result = pool.apply_async(fuck, args=(_,))
+        print(result)
+
+
+def test3():
+    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
+    ll = list(dic.values())
+    ten = torch.Tensor(ll)
+    t = ten.unsqueeze(1)
+    t = t.unsqueeze(2)
+    y = t.repeat(1, 742, 742)
+    print(ten)
+    print(y)
+    print(torch.isfinite(ten))
+    print(torch.count_nonzero(y).item())
+
+
+def test4():
+    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
+    print(torch.count_nonzero(one_bool_tensor).item())
+
+
 if __name__ == '__main__':
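The new test1 exercises the similarity-tensor construction the metrics code relies on: L2-normalize the embeddings along the last dimension, take a batched matmul with the transpose to get pairwise cosine similarities, then rescale from [-1, 1] into [0, 1] with sim / 2 + 0.5. A small sketch with made-up shapes, not repository code:

# Sketch: build a (width, length, length) cosine-similarity tensor from
# per-column embeddings of shape (width, length, dim).
import torch

width, length, dim = 3, 4, 8          # columns, rows, embedding size (illustrative)
embeddings = torch.rand((width, length, dim))
norm = torch.nn.functional.normalize(embeddings, dim=2)
sim_tensor = torch.matmul(norm, norm.transpose(1, 2)) / 2 + 0.5
print(sim_tensor.shape)               # torch.Size([3, 4, 4])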
@@ -17,32 +69,4 @@ if __name__ == '__main__':
     tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
     # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
     tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    print(time.time()-start)
-    # li = [[[6, 6, 2],
-    #        [2, 4, 6],
-    #        [2, 4, 7],
-    #        [3, 6, 4]],
-    #       [[6, 2, 7],
-    #        [3, 2, 4],
-    #        [5, 3, 5],
-    #        [6, 2, 4]],
-    #       [[7, 2, 2],
-    #        [6, 3, 2],
-    #        [6, 4, 3],
-    #        [6, 5, 6]]]
-    # tensor = torch.Tensor(li)
-    # norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    # print(norm_tensor, '\n')
-    # sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    # print(sim_ten/2 + 0.5, '\n')
-    # print(sim_ten.size())
-    # multiprocessing.set_start_method("spawn")
-    # manager = multiprocessing.Manager()
-    # lock = manager.Lock()
-    # pool = multiprocessing.Pool(16)
-    # with manager:
-    #     for _ in tqdm(range(0, 1000)):
-    #         result = pool.apply_async(fuck, args=(_,))
-    #     print(result)
+    print(time.time() - start)
