import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.managers import SharedMemoryManager
import numpy as np
import pandas as pd
import copy
import torch
from tqdm import tqdm
from settings import model, md_output_dir, confidence_threshold, support_threshold


def is_minimal(md, md_list, target_col):
    """Return True if ``md`` is minimal with respect to every other MD in ``md_list``.

    An MD (matching dependency, a dict mapping column -> similarity threshold)
    is dominated — i.e. NOT minimal — when some other MD in the list has every
    left-hand-side threshold <= this one's and a right-hand-side (``target_col``)
    threshold >= this one's: the other MD is at least as general on the left
    and at least as strict on the right.
    """
    if not md_list:
        return True
    for other in md_list:
        if other == md:
            continue
        lhs_cols = set(other.keys()) - {target_col}
        # Assume `other` dominates `md`; disprove the assumption if any of its
        # LHS thresholds is larger, or its RHS threshold is smaller.
        dominates = all(other[col] <= md[col] for col in lhs_cols)
        if other[target_col] < md[target_col]:
            dominates = False
        if dominates:
            return False
    return True


def pairs_inference(path, threshold, target_col):
    """Mine matching dependencies (MDs) for ``target_col`` from the CSV at ``path``.

    Embeds every cell with the sentence model, builds a per-column pairwise
    cosine-similarity tensor of shape (width, length, length), then scans all
    row pairs: MDs violated by a pair are specialized on the left-hand side,
    and survivors are pruned by minimality, support and confidence.

    Returns a pair ``(md_list, minimal_vio)`` of lists of MD dicts.
    """
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]

    # Embed every cell, column-major (all rows of column 0, then column 1, ...).
    # Hoist `.values` out of the loop: each access rebuilds the whole ndarray.
    values = data.values
    sentences = [values[row, col] for col in range(width) for row in range(length)]
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")

    # Reassemble into (width, length, dim) and take pairwise cosine similarity
    # per column: sim_tensor[c, i, j] is the similarity of rows i and j on column c.
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    torch.save(sim_tensor, md_output_dir + "tensor.pt")

    # Start from the most general MD: no LHS requirement, RHS similarity must be 1.
    md_list = [{col: (1 if col == target_col else 0) for col in columns}]
    minimal_vio = []

    for row1 in range(length - 1):
        terminate = False
        for row2 in range(row1 + 1, length):
            # Per-column similarity of this row pair.
            sims = {columns[i]: sim_tensor[i, row1, row2].item() for i in range(width)}

            # Collect violated MDs (LHS satisfied but RHS not), removing them
            # from the active list.
            violated_mds = []
            for md in md_list[:]:
                lhs_satis = all(sims[col] >= md[col] for col in cols_but_target)
                rhs_satis = sims[target_col] >= md[target_col]
                if lhs_satis and not rhs_satis:
                    md_list.remove(md)
                    violated_mds.append(md)

            for vio_md in violated_mds:
                # Specialize the LHS: raise one column's threshold just above
                # the observed similarity (or up to `threshold` if below it).
                # Columns whose similarity is already within 0.01 of 1 are
                # skipped — no room left to specialize.
                for col in cols_but_target:
                    if sims[col] + 0.01 <= 1:
                        spec_l_md = copy.deepcopy(vio_md)
                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                        if is_minimal(spec_l_md, md_list, target_col):
                            md_list.append(spec_l_md)
                if vio_md not in minimal_vio:
                    minimal_vio.append(vio_md)

            if not md_list:
                terminate = True
                break
        if terminate:
            break

    # Deduplicate while preserving order (dicts are unhashable, so no set()).
    deduped = []
    for md in md_list:
        if md not in deduped:
            deduped.append(md)
    md_list = deduped

    if md_list:
        for vio in minimal_vio[:]:
            if not is_minimal(vio, md_list, target_col):
                minimal_vio.remove(vio)
    for md in md_list[:]:
        if not is_minimal(md, md_list, target_col):
            md_list.remove(md)

    print('mds_list\t', len(md_list), '\n')
    print('vio_list\t', len(minimal_vio), '\n')

    if len(minimal_vio) == 0:
        return md_list, []

    # Prune surviving MDs by support.
    if md_list:
        md_rm_list = []
        for md in md_list:
            support, _ = get_metrics(md, data, sim_tensor, target_col, target_index)
            if support < support_threshold:
                md_rm_list.append(md)
        for md in md_rm_list:
            md_list.remove(md)

    # Prune violated MDs by both support and confidence.
    remove_list = []
    for md in minimal_vio:
        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
        if support < support_threshold:
            remove_list.append(md)
        if confidence < confidence_threshold and md not in remove_list:
            remove_list.append(md)
    for md in remove_list:
        minimal_vio.remove(md)

    for md in minimal_vio[:]:
        if not is_minimal(md, minimal_vio, target_col):
            minimal_vio.remove(md)

    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
    return md_list, minimal_vio


def get_metrics(current_md, data, sim_tensor, target_col, target_index):
    """Compute ``(support, confidence)`` of ``current_md`` over all row pairs.

    Support is the number of (i, j) pairs whose similarity meets the MD's
    threshold on every non-target column; confidence is the fraction of those
    pairs that also meet the target-column threshold. ``sim_tensor`` has shape
    (width, length, length) as built by ``pairs_inference``.
    """
    length = data.shape[0]
    width = data.shape[1]

    # NOTE(review): assumes current_md's key order matches the column order of
    # sim_tensor's first axis — holds for MDs built in pairs_inference, where
    # keys are inserted in `columns` order; confirm for any other caller.
    md_tensor = torch.tensor(list(current_md.values()), device='cuda')
    # Broadcast each column's threshold over the (length, length) pair grid.
    md_tensor_3d = md_tensor.unsqueeze(1).unsqueeze(2).repeat(1, length, length)
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)

    # AND together the per-column masks of all non-target columns.
    lhs_mask = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(width):
        if i != target_index:
            lhs_mask = torch.logical_and(lhs_mask, sup_tensor[i])
    support = torch.count_nonzero(lhs_mask).item()

    both_mask = torch.logical_and(lhs_mask, sup_tensor[target_index])
    confidence_numerator = torch.count_nonzero(both_mask).item()

    # Guard the division: the original divided unconditionally, raising
    # ZeroDivisionError whenever the support set is empty.
    confidence = confidence_numerator / support if support else 0.0
    return support, confidence