You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
matching_dependency/md_discovery/discovery_executor_gpu.py

139 lines
6.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
# Sampling budget for candidate MDs: discover() draws ceil(sample_number / (width - 1))
# threshold vectors, where width is the number of columns in the input table.
sample_number = 100000
# Spacing of the similarity-threshold grid that build_cartesian() lays out between
# similarity_threshold and 1.
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
    """Compute support and confidence of one candidate matching dependency.

    Args:
        md_tensor: 1-D tensor holding one similarity threshold per column.
            A threshold of -1 is satisfied by every non-negative similarity,
            which is how callers in this file disable a column.
        data: Table-like object; only ``data.shape`` is read
            (``shape[0]`` rows, ``shape[1]`` columns).
        sim_tensor: Tensor indexed as ``[column, row_a, row_b]`` holding the
            pairwise similarity of two rows on one column. Must live on the
            same device as ``md_tensor``.
        target_index: Index of the column that forms the right-hand side.

    Returns:
        A ``(support, confidence)`` tuple. ``support`` is the pair count for
        which every column (including the target) meets its threshold;
        ``confidence`` is that count divided by the pair count for which all
        non-target columns meet their thresholds (0 when the latter is 0).
    """
    length = data.shape[0]
    width = data.shape[1]

    sim_tensor = torch.round(sim_tensor, decimals=4)
    # Broadcast the per-column thresholds over the (row, row) planes instead
    # of materialising a (width, length, length) copy with repeat().
    sup_tensor = torch.ge(sim_tensor, md_tensor.reshape(-1, 1, 1))

    # Build the mask on the similarity tensor's own device so the function
    # also works when the data is not on a CUDA device.
    lhs_mask = torch.ones((length, length), dtype=torch.bool,
                          device=sim_tensor.device)
    for i in range(width):
        if i != target_index:
            lhs_mask = torch.logical_and(lhs_mask, sup_tensor[i])

    # Subtracting `length` and halving drops the diagonal (a row paired with
    # itself) and counts each unordered pair once; this presumes a symmetric
    # similarity matrix whose diagonal passes every threshold.
    support_Naumann = (torch.count_nonzero(lhs_mask).item() - length) / 2

    both_sides = torch.logical_and(lhs_mask, sup_tensor[target_index])
    support_Fan = (torch.count_nonzero(both_sides).item() - length) / 2

    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
    return support_Fan, confidence
def build_cartesian(width, target_index, device='cuda'):
    """Enumerate every combination of per-column similarity thresholds.

    Each non-target column takes every value of a grid running from
    ``similarity_threshold`` to 1 (inclusive) in steps of ``step_length``;
    the target column is fixed at 1.0.

    Args:
        width: Number of columns in the table.
        target_index: Index of the column whose threshold is fixed at 1.0.
        device: Torch device on which the tensors are created. Defaults to
            ``'cuda'``, matching the previous hard-coded behaviour.

    Returns:
        The result of ``torch.cartesian_prod`` over the per-column value sets
        (one row per combination, one entry per column when ``width`` > 1).
    """
    # Round the ratio before ceil(): floating-point noise such as
    # 30.000000000000004 would otherwise add a spurious extra interval and
    # the grid step would no longer equal step_length.
    intervals = math.ceil(round((1 - similarity_threshold) / step_length, 8))
    grid = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
                       num=intervals + 1)
    grid = np.round(grid, 4)

    grid_tensor = torch.tensor(grid, device=device).float()
    grid_tensor = torch.round(grid_tensor, decimals=4)
    target_only = torch.tensor([1.0], device=device)

    axes = [target_only if i == target_index else grid_tensor
            for i in range(width)]
    return torch.cartesian_prod(*axes)
def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
    """Expand each threshold of each candidate MD to a (rows, rows) plane.

    NOTE(review): the expanded tensor is neither returned nor stored, so this
    function currently always returns ``None``; ``sim_tensor`` and
    ``target_index`` are not used. It looks unfinished - confirm the intent
    before relying on it.
    """
    n_rows, _n_cols = data.shape[0], data.shape[1]
    # Append two singleton axes, then tile them to n_rows x n_rows.
    expanded = cartesian_product.unsqueeze(2).unsqueeze(3)
    expanded = expanded.repeat(1, 1, n_rows, n_rows)
def discover(path, target_col):
    """Discover matching dependencies (MDs) that determine ``target_col``.

    The CSV at ``path`` is read as strings, every cell is embedded with
    ``model.encode``, and a per-column pairwise cosine-similarity tensor is
    built. Candidate MDs (one similarity threshold per column, 1.0 on the
    target column, -1 on disabled columns) are sampled and scored with
    ``get_metrics``.

    Args:
        path: Path of the CSV file to analyse (read with ISO-8859-1 encoding).
        target_col: Name of the column used as the right-hand side.

    Returns:
        A list of ``(md, support, confidence)`` tuples, where ``md`` maps each
        column name to its threshold. Only candidates reaching both
        ``support_threshold`` and ``confidence_threshold`` are kept, sorted by
        confidence in descending order.

    Raises:
        ValueError: If ``target_col`` is not a column of the CSV.
    """
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    length = data.shape[0]
    width = data.shape[1]
    # Indices of every column except the target column.
    columns_indices = [c for c in range(width) if c != target_index]

    # Flatten the cells column by column (all rows of column 0, then all rows
    # of column 1, ...) so the embeddings can be split back per column.
    sentences = data.to_numpy().T.ravel().tolist()
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0)

    # Cosine similarity of every row pair, separately for each column.
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor,
                              norm_table_tensor.transpose(1, 2))
    sim_tensor = sim_tensor.float()
    sim_tensor = torch.round(sim_tensor, decimals=4)

    # Number of base candidates (candidates without any -1 entry) to keep.
    base_count = math.ceil(sample_number / (width - 1))

    # With fewer than 6 columns the full Cartesian product is affordable;
    # with more columns it may blow up exponentially.
    if width < 6:
        # All threshold combinations of the non-target columns, i.e. every
        # possible MD on the grid.
        cartesian = build_cartesian(width, target_index)
        # Down-sample to the budget when the product is larger than it.
        if cartesian.shape[0] > sample_number / (width - 1):
            picked = random.sample(range(cartesian.shape[0]), base_count)
            index = torch.tensor(picked, device='cuda')
            cartesian = torch.index_select(cartesian, 0, index)
    else:
        # Draw random thresholds as integer percentages and scale them back.
        # round() avoids truncating float noise (0.29 * 100 is slightly below
        # 29), and the upper bound is 101 because randint's `high` is
        # exclusive: 1.0 must be reachable, as it is on the grid above.
        low = round(similarity_threshold * 100)
        cartesian = torch.randint(low, 100 + 1, (base_count, width - 1),
                                  device='cuda')
        cartesian = cartesian / 100
        # Insert a column of 1.0 at the target column's position.
        ones = torch.ones((base_count, 1), device='cuda')
        cartesian = torch.cat(
            (cartesian[:, 0:target_index], ones, cartesian[:, target_index:]),
            1)
    # No-op for the grid branch, whose values are already rounded.
    cartesian = torch.round(cartesian, decimals=4)

    # The unmasked candidates come first; masked variants are appended below.
    variants = [cartesian]
    candidate_count = cartesian.shape[0]
    # Disable 1, 2, ... (width - 2) randomly chosen non-target columns per
    # candidate by setting their thresholds to -1.
    for masked_count in range(1, width - 1):
        masked_columns = [random.sample(columns_indices, masked_count)
                          for _ in range(candidate_count)]
        index = torch.tensor(masked_columns, device='cuda')
        variants.append(cartesian.scatter(1, index, -1))
    joint_md_tensor = torch.cat(variants, 0)

    md_list = []
    for row in tqdm(range(joint_md_tensor.shape[0])):
        candidate = joint_md_tensor[row]
        s, c = get_metrics(candidate, data, sim_tensor, target_index)
        if s >= support_threshold and c >= confidence_threshold:
            thresholds = [round(value, 4) for value in candidate.tolist()]
            md_list.append((dict(zip(columns, thresholds)), s, c))
    md_list.sort(key=operator.itemgetter(2), reverse=True)
    return md_list