"""Sampling-based discovery of matching dependencies (MDs) over a CSV table.

A candidate MD is a vector holding one similarity threshold per column. The
target (right-hand side) column always carries threshold 1.0, and a threshold
of -1 marks a column that takes no part in the left-hand side.
"""
import math
import operator
import random
import time

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from settings import (
    model,
    similarity_threshold,
    support_threshold,
    confidence_threshold,
    md_output_dir,
)

# Sampling budget: every sampling stage in discover() draws
# ceil(sample_number / (width - 1)) candidate MDs.
sample_number = 100000
# Spacing of the threshold grid enumerated by build_cartesian().
step_length = 0.01


def get_metrics(md_tensor, data, sim_tensor, target_index):
    """Compute support and confidence of a single candidate MD.

    Args:
        md_tensor: 1-D tensor with one similarity threshold per column.
        data: Table object; only ``data.shape`` (rows, columns) is read.
        sim_tensor: Tensor indexed ``[column, row_i, row_j]`` holding the
            pairwise similarity of the two rows' values in that column.
        target_index: Index of the target (right-hand side) column.

    Returns:
        A ``(support, confidence)`` tuple. ``support`` counts the record
        pairs that satisfy every left-hand-side threshold and the target
        threshold. ``confidence`` is that count divided by the number of
        pairs satisfying the left-hand side alone, or 0 when no pair does.
    """
    length = data.shape[0]
    width = data.shape[1]

    # Broadcast the per-column thresholds against (width, length, length)
    # instead of materializing a repeated copy for every candidate MD.
    thresholds = md_tensor[:, None, None]
    sim_tensor = torch.round(sim_tensor, decimals=4)
    sup_tensor = torch.ge(sim_tensor, thresholds)

    # AND together the per-column match matrices of all non-target columns.
    lhs_match = torch.ones(
        (length, length), dtype=torch.bool, device=sim_tensor.device
    )
    for col in range(width):
        if col != target_index:
            lhs_match = torch.logical_and(lhs_match, sup_tensor[col])

    # Subtracting `length` and halving assumes each record matches itself
    # (the diagonal) and that the match matrix is symmetric, so every
    # unordered pair of distinct records is counted once.
    lhs_support = (int(lhs_match.sum().item()) - length) / 2

    both_match = torch.logical_and(lhs_match, sup_tensor[target_index])
    support = (int(both_match.sum().item()) - length) / 2

    confidence = support / lhs_support if lhs_support > 0 else 0
    return support, confidence


def build_cartesian(width, target_index, device='cuda'):
    """Enumerate every threshold combination on the similarity grid.

    The grid runs from ``similarity_threshold`` to 1.0 (inclusive) in steps
    of ``step_length``. The target column is pinned to 1.0.

    Args:
        width: Number of columns in the table.
        target_index: Index of the target column.
        device: Torch device on which the result is created.

    Returns:
        The result of ``torch.cartesian_prod`` over the per-column value
        sets: one row per candidate MD and one column per table column
        (a 1-D tensor when ``width`` is 1).
    """
    # Round away float noise before taking the ceiling: for example
    # (1 - 0.7) / 0.01 evaluates to 30.000000000000004, whose ceiling is 31
    # and would push the grid off the intended 0.01 spacing.
    steps = math.ceil(round((1 - similarity_threshold) / step_length, 8))
    grid = np.linspace(
        start=similarity_threshold, stop=1, endpoint=True, num=steps + 1
    )
    grid = np.round(grid, 4)
    grid_tensor = torch.tensor(grid, device=device).float()
    grid_tensor = torch.round(grid_tensor, decimals=4)

    target_only = torch.tensor([1.0], device=device)
    factors = [
        target_only if col == target_index else grid_tensor
        for col in range(width)
    ]
    return torch.cartesian_prod(*factors)


def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
    """Expand candidate MDs to a 4-D tensor; unfinished, returns None.

    NOTE(review): this looks like the start of a batched variant of
    get_metrics(). It only tiles ``cartesian_product`` to shape
    (n_md, width, length, length) and then discards the result;
    ``sim_tensor`` and ``target_index`` are not used yet.
    """
    length = data.shape[0]
    width = data.shape[1]
    cartesian_product = cartesian_product.unsqueeze(2)
    cartesian_product = cartesian_product.unsqueeze(3)
    cartesian_product = cartesian_product.repeat(1, 1, length, length)


def discover(path, target_col):
    """Discover matching dependencies that determine ``target_col``.

    The CSV at ``path`` is read with every cell as a string, each cell is
    embedded with ``model``, and per-column pairwise cosine similarities are
    computed. Candidate MDs are then sampled, scored with get_metrics(), and
    filtered by ``support_threshold`` and ``confidence_threshold``.

    Args:
        path: Path of the CSV file (decoded as ISO-8859-1).
        target_col: Name of the target (right-hand side) column.

    Returns:
        A list of ``(md, support, confidence)`` tuples sorted by descending
        confidence, where ``md`` maps each column name to its threshold.

    Raises:
        ValueError: If ``target_col`` is not a column of the table, or the
            table has no column besides the target.
    """
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    length = data.shape[0]
    width = data.shape[1]
    if width < 2:
        # Without this guard the sampling size below divides by zero.
        raise ValueError(
            "discover() needs at least one column besides the target column"
        )
    # Indices of every column except the target column.
    lhs_indices = [col for col in range(width) if col != target_index]

    # Flatten the table column by column so that the embeddings can be
    # split back into one chunk of `length` rows per column.
    sentences = [
        cell for col in range(width) for cell in data.iloc[:, col].tolist()
    ]
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    # Create every helper tensor on the device the embeddings live on.
    device = embedding.device
    table_tensor = torch.stack(torch.split(embedding, length, dim=0), dim=0)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    # Dot products of unit vectors, i.e. cosine similarity per column.
    sim_tensor = torch.matmul(
        norm_table_tensor, norm_table_tensor.transpose(1, 2)
    )
    sim_tensor = torch.round(sim_tensor.float(), decimals=4)

    per_stage = math.ceil(sample_number / (width - 1))
    # Fewer than 6 columns: the full cartesian product is affordable.
    # With more columns it may blow up exponentially.
    if width < 6:
        # All threshold combinations of the non-target columns, i.e. every
        # possible MD on the grid.
        cartesian = build_cartesian(width, target_index, device=device)
        # Sample sample_number / (width - 1) MDs, none containing -1.
        if cartesian.shape[0] > sample_number / (width - 1):
            picked = torch.tensor(
                random.sample(range(cartesian.shape[0]), per_stage),
                device=device,
            )
            cartesian = torch.index_select(cartesian, 0, picked)
    else:
        # Randomly generate sample_number / (width - 1) MDs, none containing
        # -1: draw integer percentages, then scale them down to fractions.
        # round() rather than int() so float noise (0.57 * 100 ==
        # 56.99999999999999) cannot push the floor below the configured
        # threshold. randint's upper bound is exclusive, hence 100 + 1 so
        # that threshold 1.0 can be drawn, as in the cartesian branch.
        low = round(similarity_threshold * 100)
        cartesian = torch.randint(
            low, 100 + 1, (per_stage, width - 1), device=device
        )
        cartesian = cartesian / 100
        # Insert a column of similarity 1 at the target column's position.
        ones = torch.ones((per_stage, 1), device=device)
        cartesian = torch.cat(
            (cartesian[:, 0:target_index], ones, cartesian[:, target_index:]),
            1,
        )
    cartesian = torch.round(cartesian, decimals=4)

    # This tensor is concatenated with the variants that have columns set
    # to -1.
    joint_md_tensor = cartesian.clone()
    n_candidates = cartesian.shape[0]
    # Randomly set 1 column, 2 columns, ... up to width - 2 columns to -1,
    # so at least one non-target column always keeps a real threshold.
    for masked in range(1, width - 1):
        # For each MD, randomly choose which column indices become -1.
        index = torch.tensor(
            [random.sample(lhs_indices, masked) for _ in range(n_candidates)],
            device=device,
        )
        # The MD set after the chosen columns were set to -1.
        modified_cartesian = cartesian.scatter(1, index, -1)
        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)

    md_list = []
    for row in tqdm(range(joint_md_tensor.shape[0])):
        candidate = joint_md_tensor[row]
        s, c = get_metrics(candidate, data, sim_tensor, target_index)
        if s >= support_threshold and c >= confidence_threshold:
            thresholds = [round(value, 4) for value in candidate.tolist()]
            md_list.append((dict(zip(columns, thresholds)), s, c))
    md_list.sort(key=operator.itemgetter(2), reverse=True)
    return md_list