|
|
|
@ -1,3 +1,5 @@
|
|
|
|
|
import operator
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import copy
|
|
|
|
@ -13,6 +15,8 @@ def is_minimal(md, md_list, target_col):
|
|
|
|
|
return True
|
|
|
|
|
minimal = True
|
|
|
|
|
for _ in md_list:
|
|
|
|
|
if isinstance(_, tuple):
|
|
|
|
|
_ = _[0]
|
|
|
|
|
if _ != md:
|
|
|
|
|
other_cols = list(set(_.keys()) - {target_col})
|
|
|
|
|
# Assume every MD in the list makes the current MD non-minimal
|
|
|
|
@ -34,6 +38,7 @@ def is_minimal(md, md_list, target_col):
|
|
|
|
|
|
|
|
|
|
def pairs_inference(path, target_col, conf: Configuration):
|
|
|
|
|
simt = conf["similarity_thresh"]
|
|
|
|
|
# simt = round(simt, ndigits=3)
|
|
|
|
|
supt = conf["support_thresh"]
|
|
|
|
|
cont = conf["confidence_thresh"]
|
|
|
|
|
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
|
|
|
|
@ -55,6 +60,7 @@ def pairs_inference(path, target_col, conf: Configuration):
|
|
|
|
|
table_tensor = torch.stack(split_embedding, dim=0, out=None)
|
|
|
|
|
norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
|
|
|
|
|
sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
|
|
|
|
|
# sim_tensor = torch.round(sim_tensor, decimals=3)
|
|
|
|
|
|
|
|
|
|
# torch.save(sim_tensor, md_output_dir + "tensor.pt")
|
|
|
|
|
|
|
|
|
@ -65,7 +71,7 @@ def pairs_inference(path, target_col, conf: Configuration):
|
|
|
|
|
init_md[col] = 1 if col == target_col else -1
|
|
|
|
|
md_list.append(init_md)
|
|
|
|
|
|
|
|
|
|
for row1 in range(0, length - 1):
|
|
|
|
|
for row1 in tqdm(range(0, length - 1)):
|
|
|
|
|
terminate = False
|
|
|
|
|
for row2 in range(row1 + 1, length):
|
|
|
|
|
violated_mds = []
|
|
|
|
@ -126,13 +132,11 @@ def pairs_inference(path, target_col, conf: Configuration):
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if len(minimal_vio) > 0:
|
|
|
|
|
remove_list = []
|
|
|
|
|
for md in minimal_vio:
|
|
|
|
|
for md in minimal_vio[:]:
|
|
|
|
|
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
|
|
|
|
|
if confidence < cont:
|
|
|
|
|
remove_list.append(md)
|
|
|
|
|
for _ in remove_list:
|
|
|
|
|
minimal_vio.remove(_)
|
|
|
|
|
if support >= supt and confidence >= cont:
|
|
|
|
|
minimal_vio.append((md, support, confidence))
|
|
|
|
|
minimal_vio.remove(md)
|
|
|
|
|
|
|
|
|
|
if len(md_list) > 0:
|
|
|
|
|
# Remove duplicate MDs
|
|
|
|
@ -142,32 +146,32 @@ def pairs_inference(path, target_col, conf: Configuration):
|
|
|
|
|
tmp.append(_)
|
|
|
|
|
md_list = tmp
|
|
|
|
|
# Remove MDs whose support is below the threshold
|
|
|
|
|
md_rm_list = []
|
|
|
|
|
for _ in md_list:
|
|
|
|
|
for _ in md_list[:]:
|
|
|
|
|
support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
|
|
|
|
|
if support < supt:
|
|
|
|
|
md_rm_list.append(_)
|
|
|
|
|
for _ in md_rm_list:
|
|
|
|
|
if support >= supt and confidence >= cont:
|
|
|
|
|
md_list.append((_, support, confidence))
|
|
|
|
|
md_list.remove(_)
|
|
|
|
|
# Remove MDs that are not minimal
|
|
|
|
|
for _ in md_list[:]:
|
|
|
|
|
if not is_minimal(_, md_list, target_col):
|
|
|
|
|
md_list.remove(_)
|
|
|
|
|
for md_tuple in md_list[:]:
|
|
|
|
|
if not is_minimal(md_tuple[0], md_list, target_col) and md_tuple[2] < 0.5:
|
|
|
|
|
md_list.remove(md_tuple)
|
|
|
|
|
if len(minimal_vio) > 0:
|
|
|
|
|
for vio in minimal_vio[:]:
|
|
|
|
|
if not is_minimal(vio, md_list, target_col):
|
|
|
|
|
minimal_vio.remove(vio)
|
|
|
|
|
for vio_tuple in minimal_vio[:]:
|
|
|
|
|
if not is_minimal(vio_tuple[0], md_list, target_col) and vio_tuple[2] < 0.5:
|
|
|
|
|
minimal_vio.remove(vio_tuple)
|
|
|
|
|
|
|
|
|
|
if len(minimal_vio) > 0:
|
|
|
|
|
for _ in minimal_vio[:]:
|
|
|
|
|
if not is_minimal(_, minimal_vio, target_col):
|
|
|
|
|
minimal_vio.remove(_)
|
|
|
|
|
|
|
|
|
|
print(f'\033[33mList Length: {len(md_list)}\033[0m')
|
|
|
|
|
print(f'\033[33mVio Length: {len(minimal_vio)}\033[0m')
|
|
|
|
|
for vio_tuple in minimal_vio[:]:
|
|
|
|
|
if not is_minimal(vio_tuple[0], minimal_vio, target_col) and vio_tuple[2] < 0.5:
|
|
|
|
|
minimal_vio.remove(vio_tuple)
|
|
|
|
|
|
|
|
|
|
result = []
|
|
|
|
|
result.extend(md_list)
|
|
|
|
|
result.extend(minimal_vio)
|
|
|
|
|
result.sort(key=operator.itemgetter(2), reverse=True)
|
|
|
|
|
print(f'\033[33mList Length: {len(result)}\033[0m')
|
|
|
|
|
print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
|
|
|
|
|
|
|
|
|
|
return md_list, minimal_vio
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_metrics(current_md, data, sim_tensor, target_col, target_index):
|
|
|
|
@ -181,6 +185,7 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
|
|
|
|
|
md_tensor_3d = md_tensor_2d.unsqueeze(2)
|
|
|
|
|
md_tensor_3d = md_tensor_3d.repeat(1, length, length)
|
|
|
|
|
|
|
|
|
|
sim_tensor = torch.round(sim_tensor, decimals=4)
|
|
|
|
|
sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
|
|
|
|
|
ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
|
|
|
|
|
for i in range(0, width):
|
|
|
|
@ -189,10 +194,12 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
|
|
|
|
|
ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
|
|
|
|
|
sup_tensor_int = ini_slice.int()
|
|
|
|
|
support_Naumann = torch.count_nonzero(sup_tensor_int).item()
|
|
|
|
|
support_Naumann = (support_Naumann - length) / 2
|
|
|
|
|
|
|
|
|
|
ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
|
|
|
|
|
conf_tensor_int = ini_slice.int()
|
|
|
|
|
support_Fan = torch.count_nonzero(conf_tensor_int).item()
|
|
|
|
|
confidence = support_Fan / support_Naumann
|
|
|
|
|
support_Fan = (support_Fan - length) / 2
|
|
|
|
|
confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
|
|
|
|
|
|
|
|
|
|
return support_Fan, confidence
|
|
|
|
|