# matching_dependency/md_discovery/tmp_discover.py
import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.managers import SharedMemoryManager
import numpy as np
import pandas as pd
import copy
import torch
from tqdm import tqdm
from settings import model, md_output_dir
# Module-level settings.
# Confidence threshold constant.
# NOTE(review): no function visible in this file reads this value; the
# confidence filter in pairs_inference compares against a literal 0.5.
# Confirm which of the two is intended before relying on this constant.
conf_thresh = 0.8
def is_minimal(md, md_list, target_col):
    """Return True when no other entry of ``md_list`` makes ``md`` redundant.

    Each MD is a dict mapping column name -> similarity threshold.  Another
    MD makes ``md`` non-minimal when every one of its left-hand-side
    thresholds (all of its columns except ``target_col``) is less than or
    equal to the matching threshold in ``md`` AND its ``target_col``
    threshold is greater than or equal to the one in ``md``.

    Args:
        md: The candidate MD to test.
        md_list: MDs to compare against.  Entries equal to ``md`` are
            skipped, so a candidate never disqualifies itself.
        target_col: Name of the right-hand-side column.

    Returns:
        True if ``md_list`` is empty or holds no such entry, else False.
    """
    for other in md_list:
        if other == md:
            continue
        lhs_cols = set(other.keys()) - {target_col}
        # A larger left-hand threshold anywhere means `other` does not
        # cover `md` on the left-hand side.
        lhs_stricter = any(other[col] > md[col] for col in lhs_cols)
        # A smaller right-hand threshold also means `other` does not
        # cover `md`.
        rhs_weaker = other[target_col] < md[target_col]
        if not lhs_stricter and not rhs_weaker:
            # One covering MD is enough to make `md` non-minimal.
            return False
    return True
def pairs_inference(path, threshold, target_col):
    """Discover MDs for ``target_col`` by comparing every pair of rows.

    The CSV at ``path`` is loaded as strings, every cell is embedded with
    ``model.encode``, and a per-column row-by-row cosine similarity tensor
    (rescaled from [-1, 1] to [0, 1]) is built and saved to
    ``md_output_dir + "tensor.pt"``.  Starting from the most general MD
    (threshold 0 on every column, 1 on ``target_col``), each row pair that
    violates an MD removes it and replaces it with left-hand-side
    specialisations that the pair no longer satisfies.

    Args:
        path: Path of the CSV file to analyse.
        threshold: Lower bound used for a specialised left-hand threshold
            when the observed similarity is below it.
        target_col: Name of the right-hand-side column.

    Returns:
        A tuple ``(md_list, minimal_vio)``: the MDs that survived every row
        pair, and the violated MDs that remain after the minimality,
        support and confidence filters.
    """
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    # Keep file order so the specialisation order (and thus the result) is
    # reproducible; a set difference would iterate in hash order.
    cols_but_target = [col for col in columns if col != target_col]
    length, width = data.shape

    # Column-major flattening: all cells of column 0, then column 1, ...
    sentences = data.to_numpy().ravel(order="F").tolist()
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    # One chunk of `length` embeddings per column -> (width, length, dim).
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    # Map cosine similarity from [-1, 1] to [0, 1].
    sim_tensor = sim_tensor / 2 + 0.5
    torch.save(sim_tensor, md_output_dir + "tensor.pt")

    init_md = {col: 1 if col == target_col else 0 for col in columns}
    md_list = [init_md]
    minimal_vio = []

    terminate = False
    for row1 in range(0, length - 1):
        # Pull this row's similarities to the host once instead of issuing
        # one .item() call per (column, row2) cell.
        row_sims = sim_tensor[:, row1, :].tolist()
        for row2 in range(row1 + 1, length):
            # Similarity of the two rows, per column.
            sims = {col: row_sims[col_index][row2] for col_index, col in enumerate(columns)}

            # Find the MDs this pair violates: left-hand side satisfied,
            # right-hand side not.  Move them out of md_list.
            violated_mds = []
            for md in md_list[:]:
                lhs_satis = not any(sims[col] < md[col] for col in cols_but_target)
                rhs_satis = not (sims[target_col] < md[target_col])
                if lhs_satis and not rhs_satis:
                    md_list.remove(md)
                    violated_mds.append(md)

            for vio_md in violated_mds:
                # Specialise the left-hand side one column at a time so the
                # current pair no longer satisfies it.
                for col in cols_but_target:
                    if sims[col] + 0.01 <= 1:
                        spec_l_md = dict(vio_md)
                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                        if is_minimal(spec_l_md, md_list, target_col):
                            md_list.append(spec_l_md)
                if vio_md not in minimal_vio:
                    minimal_vio.append(vio_md)

            if len(md_list) == 0:
                terminate = True
                break
        if terminate:
            break

    if len(md_list) > 0:
        minimal_vio = [vio for vio in minimal_vio if is_minimal(vio, md_list, target_col)]
    print('mds_list\t', len(md_list), '\n')
    print('vio_list\t', len(minimal_vio), '\n')
    if len(minimal_vio) == 0:
        return md_list, []

    # Drop violated MDs with too little support or confidence.  Each MD is
    # dropped at most once, even if it fails both checks.
    # NOTE(review): the 0.5 cut-off is a literal here although the module
    # defines conf_thresh = 0.8 - confirm which value is intended.
    kept_vio = []
    for md in minimal_vio:
        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
        if support < 1:
            print('delete by support')
            continue
        if confidence < 0.5:
            print('delete by confidence')
            continue
        kept_vio.append(md)
    minimal_vio = kept_vio

    # Remove entries made redundant by another remaining entry.  The check
    # deliberately runs against the live list, so earlier removals affect
    # later checks.
    for vio in minimal_vio[:]:
        if not is_minimal(vio, minimal_vio, target_col):
            minimal_vio.remove(vio)
    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
    return md_list, minimal_vio
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    """Compute support and confidence of one MD over all ordered row pairs.

    Args:
        current_md: Dict mapping each column name of ``data`` to its
            similarity threshold.
        data: DataFrame whose column order matches the first axis of
            ``sim_tensor`` and whose row count matches the other two axes.
        sim_tensor: Tensor indexed as ``[column, row1, row2]`` holding
            pairwise similarities.
        target_col: Name of the right-hand-side column (unused here; kept
            for interface compatibility).
        target_index: Position of the right-hand-side column in ``data``.

    Returns:
        ``(support, confidence)``.  ``support`` counts the ``(row1, row2)``
        cells, diagonal included, whose similarity reaches the threshold on
        every column other than ``target_index``.  ``confidence`` is the
        fraction of those cells that also reach the target threshold, or 0
        when ``support`` is 0.
    """
    columns = data.columns.values.tolist()
    length = data.shape[0]
    device = sim_tensor.device

    # Look thresholds up by column name so they line up with sim_tensor's
    # first axis regardless of the dict's insertion order.
    thresholds = torch.tensor([current_md[col] for col in columns], device=device)
    # Broadcasting (width, 1, 1) against (width, length, length) avoids
    # materialising a full repeated copy of the thresholds.
    satisfied = torch.ge(sim_tensor, thresholds.view(-1, 1, 1))

    lhs_ok = torch.ones((length, length), dtype=torch.bool, device=device)
    for col_index in range(len(columns)):
        if col_index != target_index:
            lhs_ok = torch.logical_and(lhs_ok, satisfied[col_index])
    support = int(lhs_ok.sum().item())

    both_ok = torch.logical_and(lhs_ok, satisfied[target_index])
    confidence_numerator = int(both_ok.sum().item())

    # Guard the division: no supporting pair means confidence 0.
    confidence = 0 if support == 0 else confidence_numerator / support
    return support, confidence
# Support/confidence computation for a list of MDs.
def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
    """Compute support and confidence for every MD using a process pool.

    Args:
        md_list: MDs (dicts mapping column name -> threshold) to evaluate.
        dataset_path: Path of the CSV file the MDs were discovered on.
        sim_tensor: Pairwise similarity tensor indexed as
            ``[column, row1, row2]``; it is sent to the worker processes.
        target_col: Name of the right-hand-side column.

    Returns:
        A list, in ``md_list`` order, of dicts with the keys ``"md"``,
        ``"support"`` and ``"confidence"``.  Empty when ``md_list`` is empty.
    """
    # Nothing to do: skip the CSV read and the process start-up entirely.
    if len(md_list) == 0:
        return []

    data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)

    # No point in starting more workers than there are tasks.
    pool_size = min(16, len(md_list))
    result = []
    with multiprocessing.Pool(pool_size) as pool:
        # Submit everything first so the tasks actually run in parallel;
        # calling .get() right after each apply_async would serialise them.
        tasks = [
            (md, pool.apply_async(get_one_md_metadata, args=(md, data, sim_tensor, target_col)))
            for md in md_list
        ]
        for md, task in tasks:
            support, confidence = task.get()
            result.append({"md": md, "support": support, "confidence": confidence})
        pool.close()
        pool.join()
    return result
def get_one_md_metadata(md, dataframe, sim_tensor, target_col):
    """Compute support and confidence of one MD over unordered row pairs.

    Only pairs with ``row1 < row2`` are counted.  A pair supports the MD
    when, on every column except ``target_col``, its similarity is not below
    the MD's threshold.  It additionally counts towards confidence when its
    ``target_col`` similarity is not below 1.

    Args:
        md: Dict mapping each non-target column name to its threshold.
        dataframe: DataFrame whose column order matches the first axis of
            ``sim_tensor``.
        sim_tensor: Pairwise similarities indexed as ``[column, row1, row2]``
            (anything ``torch.as_tensor`` accepts).
        target_col: Name of the right-hand-side column.

    Returns:
        ``(support, confidence)``; ``confidence`` is 0 when ``support`` is 0.
    """
    columns = dataframe.columns.values.tolist()
    length = dataframe.shape[0]
    if length < 2:
        # No row pairs exist.
        return 0, 0

    sims = torch.as_tensor(sim_tensor)
    # Boolean mask selecting the strict upper triangle (row1 < row2).
    row_ids = torch.arange(length, device=sims.device)
    upper = row_ids.unsqueeze(1) < row_ids.unsqueeze(0)
    left_satisfy = upper.clone()
    both_satisfy = upper.clone()

    for col_index, col in enumerate(columns):
        # Compare in float64 so the outcome matches comparing the Python
        # float obtained from each element against the Python threshold.
        col_sims = sims[col_index, :length, :length].double()
        if col == target_col:
            # NOTE(review): the target threshold is fixed at 1 here rather
            # than read from md[target_col] - confirm this is intended.
            both_satisfy &= ~(col_sims < 1)
        else:
            passed = ~(col_sims < md[col])
            left_satisfy &= passed
            both_satisfy &= passed

    support = int(left_satisfy.sum().item())
    pre_confidence = int(both_satisfy.sum().item())
    confidence = 0 if support == 0 else pre_confidence / support
    return support, confidence