import math
import operator
import random
import time

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir

# Maximum number of candidate MD (matching dependency) threshold vectors to evaluate.
sample_number = 100000
# Granularity of the candidate similarity thresholds.
step_length = 0.01


def get_metrics(md_tensor, data, sim_tensor, target_index):
    """Compute support and confidence of one candidate MD threshold vector.

    md_tensor holds one similarity threshold per column (the target column is
    fixed to 1.0, -1 effectively disables a column); sim_tensor holds the
    pairwise similarity matrix of every column, shaped (width, length, length).
    """
    length = data.shape[0]
    width = data.shape[1]

    # Broadcast the per-column thresholds to the shape of sim_tensor.
    md_tensor_2d = md_tensor.unsqueeze(1)
    md_tensor_3d = md_tensor_2d.unsqueeze(2)
    md_tensor_3d = md_tensor_3d.repeat(1, length, length)

    sim_tensor = torch.round(sim_tensor, decimals=4)

    # Cell (i, a, b) is True if row pair (a, b) satisfies column i's threshold.
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)

    # Row pairs that satisfy the thresholds of all non-target columns.
    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(0, width):
        if i != target_index:
            ini_slice = torch.logical_and(ini_slice, sup_tensor[i])

    # The pair matrix is symmetric and its diagonal (each row paired with itself)
    # is always True, so the number of distinct qualifying pairs is (count - length) / 2.
    support_Naumann = torch.count_nonzero(ini_slice.int()).item()
    support_Naumann = (support_Naumann - length) / 2

    # Of those pairs, count the ones that also match on the target column.
    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
    support_Fan = torch.count_nonzero(conf_tensor.int()).item()
    support_Fan = (support_Fan - length) / 2
    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0

    return support_Fan, confidence
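
# Worked example of the pair counting above (hypothetical numbers, not taken
# from any dataset): with length = 4 rows, suppose 6 of the 16 entries of
# ini_slice are True -- the 4 diagonal cells plus the symmetric pair
# (0, 2)/(2, 0). Then support_Naumann = (6 - 4) / 2 = 1 qualifying row pair;
# if that pair also matches on the target column, support_Fan = 1 and
# confidence = 1.0.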


def build_cartesian(width, target_index):
    """Enumerate every candidate MD: the Cartesian product of all candidate
    thresholds over the non-target columns, with the target column fixed to 1.0."""
    # Candidate thresholds between similarity_threshold and 1, spaced by step_length.
    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
                                   num=math.ceil((1 - similarity_threshold) / step_length) + 1)
    all_values_array = np.round(all_values_array, 4)
    all_values_tensor = torch.tensor(all_values_array, device='cuda').float()
    all_values_tensor = torch.round(all_values_tensor, decimals=4)

    tensors_for_cartesian = []
    for i in range(0, width):
        if i == target_index:
            # The target column must match exactly (similarity 1.0) for the MD to hold.
            tensors_for_cartesian.append(torch.tensor([1.0], device='cuda'))
        else:
            tensors_for_cartesian.append(all_values_tensor)
    return torch.cartesian_prod(*tensors_for_cartesian)
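
# Illustrative sizing sketch (hypothetical numbers, not from the code above):
# with similarity_threshold = 0.9 and step_length = 0.01 the candidate grid has
# ceil((1 - 0.9) / 0.01) + 1 = 11 values, so for a 3-column table with
# target_index = 1, build_cartesian returns an (11 * 1 * 11) x 3 = 121 x 3
# tensor whose middle column is fixed to 1.0.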


def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
    # NOTE: unfinished, fully vectorised variant of get_metrics that would score
    # all candidate MDs at once; it is currently unused (see the commented-out
    # call in discover) and only prepares the broadcasted threshold tensor.
    length = data.shape[0]
    width = data.shape[1]
    cartesian_product = cartesian_product.unsqueeze(2)
    cartesian_product = cartesian_product.unsqueeze(3)
    cartesian_product = cartesian_product.repeat(1, 1, length, length)


def discover(path, target_col):
    """Discover matching dependencies (MDs) for target_col over the table at path."""
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]
    # Indices of every column except the target column.
    columns_indices = [_ for _ in range(0, width) if _ != target_index]

    # Embed every cell, then build one pairwise cosine-similarity matrix per column.
    sentences = []
    for col in range(0, width):
        for row in range(0, length):
            sentences.append(data.values[row, col])
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    sim_tensor = sim_tensor.float()
    sim_tensor = torch.round(sim_tensor, decimals=4)

    # With fewer than 6 columns the full Cartesian product is feasible; with more,
    # it grows exponentially, so candidates are generated randomly instead.
    if width < 6:
        # All candidate thresholds of the non-target columns, combined via the
        # Cartesian product: every possible MD (no -1 entries yet).
        cartesian = build_cartesian(width, target_index)
        # Keep at most sample_number / (width - 1) of them.
        if cartesian.shape[0] > sample_number / (width - 1):
            index = torch.tensor(random.sample(range(cartesian.shape[0]),
                                               math.ceil(sample_number / (width - 1))), device='cuda')
            cartesian = torch.index_select(cartesian, 0, index)
    else:
        # Randomly generate sample_number / (width - 1) MDs (no -1 entries yet):
        # draw integers with randint, then divide to obtain two-decimal thresholds.
        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
        cartesian = cartesian / 100
        # Insert a column of similarity 1.0 at the position of the target column.
        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
        cartesian = torch.round(cartesian, decimals=4)

    # This tensor is extended below with copies whose columns are partly set to -1.
    joint_md_tensor = cartesian.clone()
    # Randomly disable 1 column, 2 columns, ... by setting their thresholds to -1.
    for i in range(width - 2):
        index_list_format = []
        for j in range(cartesian.shape[0]):
            # For each MD, randomly choose the column indices to set to -1.
            index_list_format.append(random.sample(columns_indices, i + 1))
        index = torch.tensor(index_list_format, device='cuda')
        # The MD set after randomly disabling the chosen columns.
        modified_cartesian = cartesian.scatter(1, index, -1)
        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)

    md_list = []
    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
    for _ in tqdm(range(joint_md_tensor.shape[0])):
        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
        if s >= support_threshold and c >= confidence_threshold:
            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
            md_dict_format = {}
            for k in range(0, width):
                md_dict_format[columns[k]] = md_list_format[k]
            md_list.append((md_dict_format, s, c))
    # Sort the surviving MDs by confidence, highest first.
    md_list.sort(key=operator.itemgetter(2), reverse=True)
    return md_list
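

# Hypothetical usage sketch (not part of the original module): "hospital.csv"
# and "City" are placeholder values -- substitute your own table and target
# column. Requires a CUDA device and the objects imported from settings.
if __name__ == "__main__":
    discovered = discover("hospital.csv", target_col="City")
    # Print the ten highest-confidence MDs with their support and confidence.
    for md, support, confidence in discovered[:10]:
        print(md, support, confidence)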