You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
matching_dependency/md_discovery/md_mining.py

158 lines
7.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch import LongTensor
from tqdm import tqdm
from settings import *
# note 对表进行嵌入时定位了有空值的cell, 计算相似度时有空值则置为-1.0000
def mining(train: pd.DataFrame):
    """Mine matching dependencies (MDs) from a labeled table of tuple pairs.

    Each row of ``train`` is one tuple pair. Left-tuple attributes live in
    ``ltable_*`` columns, right-tuple attributes in the corresponding
    ``rtable_*`` columns, and the ``gold`` column marks matching pairs with
    the value ``'1'`` (all cells are compared as strings).

    The procedure is:

    1. Drop ``_id`` and both key columns, so that a match is decided by the
       ``gold`` label only and not by the keys.
    2. Embed every remaining cell with the module-level ``model`` and compute,
       per attribute pair, the cosine similarity between the left and right
       cell, rounded to two decimals. Pairs where either cell is the empty
       string get similarity ``-1``.
    3. Search level by level starting from the all ``-1`` MD (no constraint).
       An MD with support >= 1 whose confidence reaches
       ``confidence_threshold`` is kept; one whose confidence is too low is
       tightened by raising one attribute threshold at a time to the next
       observed similarity value; one without support is discarded.

    Relies on ``model``, ``ltable_id``, ``rtable_id`` and
    ``confidence_threshold`` being defined at module level (presumably via
    ``from settings import *`` -- confirm) and on a CUDA device being present.

    Args:
        train: Tuple-pair table containing ``_id``, ``gold``, the two key
            columns and the paired ``ltable_*`` / ``rtable_*`` attributes.

    Returns:
        A list of 1-D tensors, one per accepted MD. Element ``i`` is the
        minimum similarity required on the ``i``-th attribute pair
        (``-1.0`` means the attribute is unconstrained). The list is empty
        when ``train`` has no rows or no attribute pairs.
    """
    # Every row of `train` is a tuple pair; compare all cells as strings.
    train = train.astype(str)
    # Keys are not forced to agree between the two tables; only `gold` decides
    # whether a pair matches, so the key columns are removed outright.
    data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False)
    # Remaining columns: left/right attributes (without keys) and `gold`.
    columns = data.columns.values.tolist()
    # One (left column index, right column index) tuple per attribute pair.
    col_tuple_list = build_col_tuple_list(columns)
    length = data.shape[0]
    if length == 0 or not col_tuple_list:
        # Nothing to embed or compare; torch.split / torch.cat would fail.
        return []

    # Note: the index is reset here, so row labels equal row positions below.
    data = data.reset_index(drop=True)
    # Column-major flattening: the first `length` sentences are column 0, etc.
    sentences = data.values.flatten(order='F').tolist()
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda", batch_size=256, show_progress_bar=True)
    # Regroup the flat embeddings into one chunk of `length` rows per column.
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0)
    # Unit-length embeddings, so a dot product below is a cosine similarity.
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)

    sim_tensor_list = []
    for left_index, right_index in col_tuple_list:
        # NOTE(review): after astype(str) a NaN cell becomes the string 'nan',
        # which this mask does not treat as empty -- confirm that missing
        # values are loaded as '' upstream.
        mask = data[columns[left_index]].isin(['']) | data[columns[right_index]].isin([''])
        empty_string_indices = data[mask].index.tolist()
        sim_tensor = torch.sum(norm_table_tensor[left_index] * norm_table_tensor[right_index], 1)
        # Force rows with an empty cell on either side to -1.
        empty_rows = torch.tensor(empty_string_indices, dtype=torch.long, device='cuda')
        sim_tensor = sim_tensor.scatter(0, empty_rows, -1.0000)
        sim_tensor = torch.round(sim_tensor, decimals=2)
        sim_tensor_list.append(sim_tensor.unsqueeze(1))
    sim_table_tensor = torch.cat(sim_tensor_list, dim=1)

    # Append a label column: 0 by default, 1 for rows whose gold value is '1'.
    label_tensor = torch.zeros((sim_table_tensor.size(0), 1), device='cuda')
    sim_table_tensor_labeled = torch.cat((sim_table_tensor, label_tensor), 1)
    match_pair_indices = data[data['gold'].isin(['1'])].index.tolist()
    sim_table_tensor_labeled[match_pair_indices, -1] = 1.00

    md_list = init_md_list(len(col_tuple_list))

    # Per attribute: the ascending distinct similarity values, without the
    # negative ones (i.e. without the -1 marker for empty cells).
    sorted_unique_value_tensor_list = []
    for column_index in range(len(col_tuple_list)):
        unique_values = torch.sort(sim_table_tensor[:, column_index].unique()).values
        sorted_unique_value_tensor_list.append(unique_values[unique_values >= 0])

    # Threshold appended for the label column: 0.5 separates label 1 from 0.
    label_threshold = torch.tensor([0.5], device='cuda')
    result_list = []
    # Walk the candidate list: accepted MDs go to the result, MDs with too low
    # a confidence are tightened into the next round's candidates, and MDs
    # without support are dropped. Stop when no candidates are left.
    while md_list:
        tmp_list = []
        # Threshold vectors already queued for the next round. Different
        # parents can produce the same tightened MD, so de-duplicate here.
        queued = set()
        for md_tensor in tqdm(md_list):
            md_tensor_labeled = torch.cat((md_tensor, label_threshold), 0)
            abs_support, confidence = get_metrics(md_tensor_labeled, sim_table_tensor_labeled)
            if abs_support < 1:
                # No supporting pair: tightening cannot help, skip.
                continue
            if confidence >= confidence_threshold:
                # Support and confidence are both satisfied: accept.
                result_list.append(md_tensor)
                continue
            # Enough support but too little confidence: raise each attribute
            # threshold, one at a time, to its next possible value.
            for attr_index in range(len(md_tensor)):
                candidates = sorted_unique_value_tensor_list[attr_index]
                if candidates.numel() == 0:
                    # This attribute has no usable similarity value at all.
                    continue
                new_md_tensor = md_tensor.clone()
                if new_md_tensor[attr_index] == -1.00:
                    new_md_tensor[attr_index] = candidates[0]
                else:
                    next_index = torch.where(candidates == new_md_tensor[attr_index])[0].item() + 1
                    if next_index >= len(candidates):
                        # Already at the largest value; cannot tighten more.
                        continue
                    new_md_tensor[attr_index] = candidates[next_index]
                key = tuple(new_md_tensor.tolist())
                if key not in queued:
                    queued.add(key)
                    tmp_list.append(new_md_tensor)
        md_list = tmp_list
    return result_list
# sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
# sim_tensor = sim_tensor.float()
# sim_tensor = torch.round(sim_tensor, decimals=4)
def build_col_tuple_list(columns_):
    """Pair every ``ltable_*`` column with its ``rtable_*`` counterpart.

    Only the leading ``ltable_`` prefix is swapped for ``rtable_``; the rest
    of the column name is kept verbatim, even if it contains ``ltable_``
    again. Columns without the ``ltable_`` prefix (for example ``gold``) are
    ignored.

    Args:
        columns_: List of column names.

    Returns:
        A list of ``(left_index, right_index)`` tuples, in the order the left
        columns appear in ``columns_``. Each left column is reported at its
        own position; the right column at its first occurrence.

    Raises:
        ValueError: If a left column has no matching right column.
    """
    left_prefix = 'ltable_'
    right_prefix = 'rtable_'

    # Name -> first position, so each lookup is O(1) instead of list.index().
    first_position = {}
    for position, name in enumerate(columns_):
        first_position.setdefault(name, position)

    col_tuple_list_ = []
    for left_index, name in enumerate(columns_):
        if not name.startswith(left_prefix):
            continue
        right_name = right_prefix + name[len(left_prefix):]
        if right_name not in first_position:
            raise ValueError(
                f"column {name!r} has no matching {right_name!r} column"
            )
        col_tuple_list_.append((left_index, first_position[right_name]))
    return col_tuple_list_
def init_md_list(md_dimension: int, device='cuda'):
    """Create the starting candidate list for the MD search.

    The list holds a single MD whose thresholds are all ``-1.0``, i.e. an MD
    that places no similarity constraint on any attribute.

    Args:
        md_dimension: Number of attribute pairs (length of the MD tensor).
        device: Device the tensor is created on. Defaults to ``'cuda'``, the
            device this function previously hard-coded; pass ``'cpu'`` to run
            without a GPU.

    Returns:
        A one-element list containing a 1-D tensor of ``md_dimension`` values,
        all equal to ``-1.0``.
    """
    # -1.0 is exactly representable, so no rounding step is needed.
    return [torch.full((md_dimension,), -1.0, device=device)]
def get_metrics(md_tensor_labeled_, sim_table_tensor_labeled_):
    """Compute absolute support and confidence of one MD.

    Args:
        md_tensor_labeled_: 1-D tensor of per-attribute similarity thresholds
            followed by one threshold for the label column.
        sim_table_tensor_labeled_: 2-D tensor with one row per tuple pair:
            the attribute similarities followed by the label column.

    Returns:
        ``(abs_support, confidence)`` where ``abs_support`` is the number of
        rows meeting every attribute threshold and ``confidence`` is the
        fraction of those rows that also meet the label threshold
        (``0`` when there is no supporting row).
    """
    # View the MD as a single row; the comparison broadcasts it over all rows.
    threshold_row = md_tensor_labeled_.unsqueeze(0)
    meets_threshold = sim_table_tensor_labeled_ >= threshold_row

    # Rows that satisfy all attribute thresholds (label column excluded).
    attrs_satisfied = meets_threshold[:, :-1].all(dim=1)
    # Rows that satisfy the attribute thresholds and the label threshold.
    all_satisfied = meets_threshold.all(dim=1)

    abs_support_ = attrs_satisfied.sum().item()
    abs_strict_support_ = all_satisfied.sum().item()

    if abs_support_ > 0:
        confidence_ = abs_strict_support_ / abs_support_
    else:
        confidence_ = 0
    return abs_support_, confidence_