import copy
import operator

import numpy as np
import pandas as pd
import torch
from ConfigSpace import Configuration
from tqdm import tqdm

from settings import model
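# NOTE: `model` is expected to provide a sentence-transformers style
# `encode(sentences, convert_to_tensor=True, device=...)` method (an assumption
# based on how it is called below); the concrete model is configured in `settings`.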
def is_minimal(md, md_list, target_col):
    """Return True if `md` is not dominated by any MD in `md_list`.

    An MD dominates `md` when all of its LHS thresholds are no larger and its
    RHS (target column) threshold is no smaller.
    """
    # Assume this md is minimal.
    if len(md_list) == 0:
        return True
    minimal = True
    for _ in md_list:
        if isinstance(_, tuple):
            _ = _[0]
        if _ != md:
            other_cols = list(set(_.keys()) - {target_col})
            # Assume every md in the list makes the current md non-minimal.
            exist = True
            # If any LHS threshold of the other md is larger, the assumption fails.
            for col in other_cols:
                if _[col] > md[col]:
                    exist = False
                    break
            # If the RHS threshold of the other md is smaller, the assumption also fails.
            if _[target_col] < md[target_col]:
                exist = False
            # If the assumption holds for any md, the current md is not minimal.
            if exist:
                minimal = False
                break
    return minimal
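# Small illustration of the dominance check above (column names and thresholds
# are made-up values, not from any real dataset). With target column "title":
#   md_a = {"name": 0.80, "brand": 0.70, "title": 0.90}
#   md_b = {"name": 0.75, "brand": 0.70, "title": 0.90}
# md_b dominates md_a (no larger on every LHS column, no smaller on the RHS), so
# is_minimal(md_a, [md_b], "title") is False while is_minimal(md_b, [md_a], "title") is True.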
def pairs_inference(path, target_col, conf: Configuration):
    """Infer minimal MDs for `target_col` from the CSV at `path`.

    Returns a list of (md, support, confidence) tuples sorted by confidence, descending.
    """
    simt = conf["similarity_thresh"]
    # simt = round(simt, ndigits=3)
    supt = conf["support_thresh"]
    cont = conf["confidence_thresh"]
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]

    # Encode every cell, column by column, so the embeddings can be split back
    # into one (length x embedding_dim) block per column.
    sentences = []
    for col in range(0, width):
        for row in range(0, length):
            cell_value = data.values[row, col]
            sentences.append(cell_value)
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    # sim_tensor[col, i, j] is the cosine similarity between rows i and j on column col.
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    # sim_tensor = torch.round(sim_tensor, decimals=3)

    # torch.save(sim_tensor, md_output_dir + "tensor.pt")

    md_list = []
    minimal_vio = []
    # Start from the most general MD: no LHS constraint (-1), exact RHS match (1).
    init_md = {}
    for col in columns:
        init_md[col] = 1 if col == target_col else -1
    md_list.append(init_md)
    for row1 in tqdm(range(0, length - 1)):
        terminate = False
        for row2 in range(row1 + 1, length):
            violated_mds = []
            # sims holds the per-column similarity of the two rows.
            sims = {}
            for col_index in range(0, width):
                col = columns[col_index]
                similarity = sim_tensor[col_index, row1, row2].item()
                sims[col] = similarity

            # Find violated MDs: remove them from the MD list and collect them.
            for md in md_list[:]:
                lhs_satis = True
                rhs_satis = True
                for col in cols_but_target:
                    if sims[col] < md[col]:
                        lhs_satis = False
                        break
                if sims[target_col] < md[target_col]:
                    rhs_satis = False
                if lhs_satis and not rhs_satis:
                    md_list.remove(md)
                    violated_mds.append(md)
            # for vio_md in violated_mds:
            #     # Specialize the left-hand side.
            #     for col in cols_but_target:
            #         if sims[col] + 0.01 <= 1:
            #             spec_l_md = copy.deepcopy(vio_md)
            #             spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
            #             if is_minimal(spec_l_md, md_list, target_col):
            #                 md_list.append(spec_l_md)
            #     if vio_md not in minimal_vio:
            #         minimal_vio.append(vio_md)
            for vio_md in violated_mds:
                vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
                if vio_md_support >= supt:
                    # Specialize the left-hand side: raise one LHS threshold just above
                    # the observed similarity (or up to the similarity threshold).
                    for col in cols_but_target:
                        if sims[col] < 1.0:
                            spec_l_md = copy.deepcopy(vio_md)
                            if sims[col] < simt:
                                spec_l_md[col] = simt
                            elif sims[col] + 0.01 <= 1.0:
                                spec_l_md[col] = sims[col] + 0.01
                            else:
                                spec_l_md[col] = 1.0
                            if is_minimal(spec_l_md, md_list, target_col):
                                md_list.append(spec_l_md)
                    if vio_md not in minimal_vio:
                        minimal_vio.append(vio_md)

            if len(md_list) == 0:
                terminate = True
                break
        if terminate:
            break
    # Re-check the violated MDs: keep only those whose support and confidence meet
    # the thresholds, stored as (md, support, confidence) tuples.
    if len(minimal_vio) > 0:
        for md in minimal_vio[:]:
            support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
            if support >= supt and confidence >= cont:
                minimal_vio.append((md, support, confidence))
            minimal_vio.remove(md)
    if len(md_list) > 0:
        # Remove duplicate MDs.
        tmp = []
        for _ in md_list:
            if _ not in tmp:
                tmp.append(_)
        md_list = tmp
        # Keep only MDs whose support and confidence meet the thresholds,
        # stored as (md, support, confidence) tuples.
        for _ in md_list[:]:
            support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
            if support >= supt and confidence >= cont:
                md_list.append((_, support, confidence))
            md_list.remove(_)
        # Drop non-minimal MDs whose confidence is below 0.5.
        for md_tuple in md_list[:]:
            if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
                md_list.remove(md_tuple)
        # Prune violated MDs that are not minimal with respect to the surviving MD list.
        if len(minimal_vio) > 0:
            for vio_tuple in minimal_vio[:]:
                if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
                    minimal_vio.remove(vio_tuple)
    # Prune violated MDs that are not minimal with respect to each other.
    if len(minimal_vio) > 0:
        for vio_tuple in minimal_vio[:]:
            if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
                minimal_vio.remove(vio_tuple)
    result = []
    result.extend(md_list)
    result.extend(minimal_vio)
    # Sort by confidence, highest first.
    result.sort(key=operator.itemgetter(2), reverse=True)
    print(f'\033[33mList Length: {len(result)}\033[0m')
    print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
    return result

def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    """Compute the support and confidence of `current_md` over all row pairs."""
    columns = data.columns.values.tolist()
    length = data.shape[0]
    width = data.shape[1]

    # Broadcast the MD thresholds to a (width, length, length) tensor so they can be
    # compared against the per-column pairwise similarity tensor.
    md_tensor = list(current_md.values())
    md_tensor = torch.tensor(md_tensor, device='cuda')
    md_tensor_2d = md_tensor.unsqueeze(1)
    md_tensor_3d = md_tensor_2d.unsqueeze(2)
    md_tensor_3d = md_tensor_3d.repeat(1, length, length)

    sim_tensor = torch.round(sim_tensor, decimals=4)
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
    # Row pairs whose LHS columns all satisfy the MD thresholds.
    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(0, width):
        if i != target_index:
            sup_tensor_slice = sup_tensor[i]
            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
    sup_tensor_int = ini_slice.int()
    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
    # Drop the diagonal (rows paired with themselves) and halve to count unordered pairs.
    support_Naumann = (support_Naumann - length) / 2

    # Row pairs that additionally satisfy the RHS threshold.
    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
    conf_tensor_int = ini_slice.int()
    support_Fan = torch.count_nonzero(conf_tensor_int).item()
    support_Fan = (support_Fan - length) / 2
    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0

    return support_Fan, confidence
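# Minimal usage sketch. The CSV path, target column, and hyperparameter ranges below
# are illustrative assumptions, not part of this module; running it also assumes a
# CUDA device and the encoder configured in `settings`.
if __name__ == "__main__":
    from ConfigSpace import ConfigurationSpace
    from ConfigSpace.hyperparameters import UniformFloatHyperparameter

    cs = ConfigurationSpace()
    cs.add_hyperparameters([
        UniformFloatHyperparameter("similarity_thresh", 0.5, 1.0, default_value=0.9),
        UniformFloatHyperparameter("support_thresh", 1.0, 100.0, default_value=1.0),
        UniformFloatHyperparameter("confidence_thresh", 0.0, 1.0, default_value=0.8),
    ])
    conf = cs.get_default_configuration()
    # Hypothetical input file and target column.
    mds = pairs_inference("data/example.csv", "title", conf)
    for md, support, confidence in mds[:5]:
        print(md, support, confidence)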