matching_dependency/md_discovery/tmp_discover.py

import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.managers import SharedMemoryManager
import numpy as np
import pandas as pd
import copy
import torch
from tqdm import tqdm
from settings import model, md_output_dir, confidence_threshold, support_threshold
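# Note: the `settings` module is not shown in this file; from its usage below it is
# assumed to provide a sentence-embedding model with an `encode()` method (e.g. a
# SentenceTransformer), an output directory string `md_output_dir`, and two float
# thresholds `confidence_threshold` and `support_threshold`.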
def is_minimal(md, md_list, target_col):
    # An MD is minimal only if no other MD in md_list dominates it, i.e. no other MD
    # has all LHS thresholds <= ours while its RHS threshold is >= ours.
    # Assume this md is minimal until a dominating md is found.
    if len(md_list) == 0:
        return True
    minimal = True
    for _ in md_list:
        if _ != md:
            other_cols = list(set(_.keys()) - {target_col})
            # assume this md in the list makes the current md non-minimal
            exist = True
            # the assumption fails if any of its LHS thresholds is larger
            for col in other_cols:
                if _[col] > md[col]:
                    exist = False
                    break
            # the assumption also fails if its RHS threshold is smaller
            if _[target_col] < md[target_col]:
                exist = False
            # if the assumption holds for any md, the current md is not minimal
            if exist:
                minimal = False
                break
    return minimal
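# Illustrative sketch (not part of the original file; column names are hypothetical):
# with target column "price",
#   md_a = {"name": 0.7, "brand": 0.0, "price": 0.9}
# is not minimal next to
#   md_b = {"name": 0.6, "brand": 0.0, "price": 0.9},
# because md_b demands less similarity on every LHS column while promising at least
# as much on the RHS, so is_minimal(md_a, [md_b], "price") returns False.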
def pairs_inference(path, threshold, target_col):
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)

    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]
    # embed every cell column by column, then rebuild a (width, length, dim) tensor
    sentences = []
    for col in range(0, width):
        for row in range(0, length):
            cell_value = data.values[row, col]
            sentences.append(cell_value)
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    # sim_tensor[c, i, j] is the cosine similarity of rows i and j on column c
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    torch.save(sim_tensor, md_output_dir + "tensor.pt")
    md_list = []
    minimal_vio = []
    # start from the most general MD: no LHS requirement, RHS threshold 1
    init_md = {}
    for col in columns:
        init_md[col] = 1 if col == target_col else 0
    md_list.append(init_md)
    for row1 in range(0, length - 1):
        terminate = False
        for row2 in range(row1 + 1, length):
            violated_mds = []
            # sims holds the per-column similarity of the two rows
            sims = {}
            for col_index in range(0, width):
                col = columns[col_index]
                similarity = sim_tensor[col_index, row1, row2].item()
                sims[col] = similarity
            # find violated MDs, remove them from the MD list and collect them
            for md in md_list[:]:
                lhs_satis = True
                rhs_satis = True
                for col in cols_but_target:
                    if sims[col] < md[col]:
                        lhs_satis = False
                        break
                if sims[target_col] < md[target_col]:
                    rhs_satis = False
                if lhs_satis and not rhs_satis:
                    md_list.remove(md)
                    violated_mds.append(md)
            for vio_md in violated_mds:
                # specialize the LHS
                for col in cols_but_target:
                    if sims[col] + 0.01 <= 1:
                        spec_l_md = copy.deepcopy(vio_md)
                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                        if is_minimal(spec_l_md, md_list, target_col):
                            md_list.append(spec_l_md)
                if vio_md not in minimal_vio:
                    minimal_vio.append(vio_md)
            # for vio_md in violated_mds:
            #     # specialize the LHS
            #     for col in cols_but_target:
            #         if sims[col] < 1:
            #             spec_l_md = copy.deepcopy(vio_md)
            #             if sims[col] < threshold:
            #                 spec_l_md[col] = threshold
            #             else:
            #                 if sims[col] + 0.01 <= 1:
            #                     spec_l_md[col] = sims[col] + 0.01
            #                 else:
            #                     spec_l_md[col] = 1
            #             if is_minimal(spec_l_md, md_list, target_col):
            #                 md_list.append(spec_l_md)
            #     if vio_md not in minimal_vio:
            #         minimal_vio.append(vio_md)

            if len(md_list) == 0:
                terminate = True
                break
        if terminate:
            break
    tmp = []
    for _ in md_list:
        if _ not in tmp:
            tmp.append(_)
    md_list = tmp

    if len(md_list) > 0:
        for vio in minimal_vio[:]:
            if not is_minimal(vio, md_list, target_col):
                minimal_vio.remove(vio)

    for _ in md_list[:]:
        if not is_minimal(_, md_list, target_col):
            md_list.remove(_)

    print('mds_list\t', len(md_list), '\n')
    print('vio_list\t', len(minimal_vio), '\n')
    if len(minimal_vio) == 0:
        return md_list, []
    remove_list = []

    # drop MDs with too little support
    if len(md_list) > 0:
        md_rm_list = []
        for _ in md_list:
            support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
            if support < support_threshold:
                md_rm_list.append(_)
        for _ in md_rm_list:
            md_list.remove(_)
    # drop violated MDs with too little support or confidence
    for md in minimal_vio:
        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
        if support < support_threshold:
            remove_list.append(md)
        if confidence < confidence_threshold and md not in remove_list:
            remove_list.append(md)
    for _ in remove_list:
        minimal_vio.remove(_)

    for _ in minimal_vio[:]:
        if not is_minimal(_, minimal_vio, target_col):
            minimal_vio.remove(_)
    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')

    return md_list, minimal_vio
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    # support: number of row pairs whose LHS similarities all reach the MD's thresholds;
    # confidence: fraction of those pairs that also reach the RHS threshold.
    columns = data.columns.values.tolist()
    length = data.shape[0]
    width = data.shape[1]
    md_tensor = list(current_md.values())
    md_tensor = torch.tensor(md_tensor, device='cuda')
    md_tensor_2d = md_tensor.unsqueeze(1)
    md_tensor_3d = md_tensor_2d.unsqueeze(2)
    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(0, width):
        if i != target_index:
            sup_tensor_slice = sup_tensor[i]
            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
    sup_tensor_int = ini_slice.int()
    support = torch.count_nonzero(sup_tensor_int).item()
    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
    conf_tensor_int = ini_slice.int()
    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
    # guard against division by zero when no pair satisfies the LHS
    confidence = confidence_numerator / support if support > 0 else 0
    return support, confidence
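# Minimal usage sketch (not part of the original file): the CSV path, similarity
# threshold, and target column below are placeholder values, and running this
# requires the CUDA device and the `settings` module assumed above.
if __name__ == '__main__':
    mds, violations = pairs_inference('data/table.csv', 0.7, 'price')
    print('minimal MDs:', len(mds))
    print('violated MDs kept as candidates:', len(violations))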