diff --git a/md_discovery/md_mining.py b/md_discovery/md_mining.py
index 8405be4..f949926 100644
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@@ -1,3 +1,5 @@
+import random
+import operator
 import pandas as pd
 import torch
 import matplotlib.pyplot as plt
@@ -17,6 +19,7 @@ def mining(train: pd.DataFrame):
     data = train.drop(columns=['_id', 'ltable_' + ltable_id, 'rtable_' + rtable_id], inplace=False)
     # attributes now present in data: all left/right-table attributes except the keys, plus gold; no _id
     columns = data.columns.values.tolist()
+    columns_without_prefix = [_.replace('ltable_', '') for _ in columns if _.startswith('ltable_')]
    # a list of 2-tuples, each holding the indices of a corresponding column pair
     col_tuple_list = build_col_tuple_list(columns)
 
@@ -61,9 +64,6 @@ def mining(train: pd.DataFrame):
     # use the indices to set the label of matching rows to 1
     sim_table_tensor_labeled[match_pair_indices, -1] = 1.00
 
-    md_list = init_md_list(len(col_tuple_list))
-    result_md_list = []
-
     sorted_unique_value_tensor_list = []
     for _ in range(len(col_tuple_list)):
         # sort each column of sim_table_tensor in ascending order and add it to the list
@@ -72,46 +72,58 @@ def mining(train: pd.DataFrame):
         sorted_unique_value_tensor = sorted_unique_value_tensor[sorted_unique_value_tensor >= 0]
         sorted_unique_value_tensor_list.append(sorted_unique_value_tensor)
 
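+    # A candidate MD is encoded as a row vector with one entry per attribute pair:
+    # entry j is the similarity threshold required on attribute j, and -1 marks an
+    # unconstrained attribute that imposes no similarity requirement.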
+    # randomly generate candidate MDs, forming a 2-D tensor with one candidate MD per row
+    candidate_mds_tensor = build_candidate_md_matrix(sorted_unique_value_tensor_list)
     result_list = []
+    # iterate over every candidate MD
+    for _ in tqdm(range(candidate_mds_tensor.shape[0])):
+        # append a 0.5 marker to each MD, meaning "match"
+        md_tensor_labeled = torch.cat((candidate_mds_tensor[_], torch.tensor([0.5], device='cuda')), 0)
+        abs_support, confidence = get_metrics(md_tensor_labeled, sim_table_tensor_labeled)
+        if abs_support >= support_threshold and confidence >= confidence_threshold:
+            md_list_format = [round(i, 2) for i in candidate_mds_tensor[_].tolist()]
+            md_dict_format = dict(zip(columns_without_prefix, md_list_format))
+            result_list.append((md_dict_format, abs_support, confidence))
+    result_list.sort(key=operator.itemgetter(2), reverse=True)
+    mds_to_txt(result_list)
+    return result_list
 
     # iterate over the MD list: qualifying MDs go straight into the result list; the rest are tightened if possible, otherwise skipped
     # tightened MDs are staged in a temporary list that replaces the MD list after each pass, until the MD list is empty
-    while len(md_list) > 0:
-        tmp_list = []
-        for md_tensor in tqdm(md_list):
-            md_tensor_labeled = torch.cat((md_tensor, torch.tensor([0.5], device='cuda')), 0)
-            abs_support, confidence = get_metrics(md_tensor_labeled, sim_table_tensor_labeled)
-            # if support is below 1, there is no point tightening the thresholds; skip
-            if abs_support >= 1:
-                # if support qualifies but confidence does not, the thresholds need tightening
-                if confidence < confidence_threshold:
-                    for _ in range(len(md_tensor)):
-                        new_md_tensor = md_tensor.clone()
-                        if new_md_tensor[_] == -1.00:
-                            new_md_tensor[_] = sorted_unique_value_tensor_list[_][0]
-                            if len(tmp_list) == 0:
-                                tmp_list.append(new_md_tensor)
-                            else:
-                                stacked_tmp_tensors = torch.stack(tmp_list)
-                                is_contained = (stacked_tmp_tensors == new_md_tensor).all(dim=1).any()
-                                if not is_contained:
-                                    tmp_list.append(new_md_tensor)
-                        else:
-                            a_tensor = sorted_unique_value_tensor_list[_]
-                            b_value = new_md_tensor[_]
-                            next_index = torch.where(a_tensor == b_value)[0].item() + 1
-                            if next_index < len(a_tensor):
-                                new_md_tensor[_] = a_tensor[next_index]
-                                tmp_list.append(new_md_tensor)
-                        # torch.where(sorted_unique_value_tensor_list[2] == 0.16)[0].item()
-                # if both metrics qualify, add the MD straight to the result list
-                else:
-                    result_list.append(md_tensor)
-        md_list = tmp_list
-
-    print(1)
-    # sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
-    # sim_tensor = sim_tensor.float()
-    # sim_tensor = torch.round(sim_tensor, decimals=4)
+    # while len(md_list) > 0:
+    #     tmp_list = []
+    #     for md_tensor in tqdm(md_list):
+    #         md_tensor_labeled = torch.cat((md_tensor, torch.tensor([0.5], device='cuda')), 0)
+    #         abs_support, confidence = get_metrics(md_tensor_labeled, sim_table_tensor_labeled)
+    #         # if support is below 1, there is no point tightening the thresholds; skip
+    #         if abs_support >= 1:
+    #             # if support qualifies but confidence does not, the thresholds need tightening
+    #             if confidence < confidence_threshold:
+    #                 for _ in range(len(md_tensor)):
+    #                     new_md_tensor = md_tensor.clone()
+    #                     if new_md_tensor[_] == -1.00:
+    #                         new_md_tensor[_] = sorted_unique_value_tensor_list[_][0]
+    #                         if len(tmp_list) == 0:
+    #                             tmp_list.append(new_md_tensor)
+    #                         else:
+    #                             stacked_tmp_tensors = torch.stack(tmp_list)
+    #                             is_contained = (stacked_tmp_tensors == new_md_tensor).all(dim=1).any()
+    #                             if not is_contained:
+    #                                 tmp_list.append(new_md_tensor)
+    #                     else:
+    #                         a_tensor = sorted_unique_value_tensor_list[_]
+    #                         b_value = new_md_tensor[_]
+    #                         next_index = torch.where(a_tensor == b_value)[0].item() + 1
+    #                         if next_index < len(a_tensor):
+    #                             new_md_tensor[_] = a_tensor[next_index]
+    #                             tmp_list.append(new_md_tensor)
+    #                     # torch.where(sorted_unique_value_tensor_list[2] == 0.16)[0].item()
+    #             # if both metrics qualify, add the MD straight to the result list
+    #             else:
+    #                 result_list.append(md_tensor)
+    #     md_list = tmp_list
 
 
 def build_col_tuple_list(columns_):
@@ -124,13 +136,13 @@ def build_col_tuple_list(columns_):
     return col_tuple_list_
 
 
-def init_md_list(md_dimension: int):
-    md_list_ = []
-    # create the initial all -1 MD, rounded to two decimal places
-    init_md_tensor = torch.full((md_dimension, ), -1.0, device='cuda')
-    init_md_tensor = torch.round(init_md_tensor, decimals=2)
-    md_list_.append(init_md_tensor)
-    return md_list_
+# def init_md_list(md_dimension: int):
+#     md_list_ = []
+#     # create the initial all -1 MD, rounded to two decimal places
+#     init_md_tensor = torch.full((md_dimension, ), -1.0, device='cuda')
+#     init_md_tensor = torch.round(init_md_tensor, decimals=2)
+#     md_list_.append(init_md_tensor)
+#     return md_list_
 
 
 def get_metrics(md_tensor_labeled_, sim_table_tensor_labeled_):
@@ -155,3 +167,42 @@ def get_metrics(md_tensor_labeled_, sim_table_tensor_labeled_):
     confidence_ = abs_strict_support_ / abs_support_ if abs_support_ > 0 else 0
 
     return abs_support_, confidence_
+
+
+# randomly generate MDs and stack them into a matrix, one MD per row
+def build_candidate_md_matrix(sorted_unique_value_tensor_list_: list):
+    # tentatively sample 20000 candidates to start with
+    length_ = len(sorted_unique_value_tensor_list_)
+    N = 20000
+    # sample N values with replacement from the first column's similarity values, producing row indices
+    indices = torch.randint(0, len(sorted_unique_value_tensor_list_[0]), (N, 1))
+    # build an index tensor for every remaining column, giving the indices of randomly chosen values from that column's tensor
+    for _ in range(1, length_):
+        indices = torch.cat((indices, torch.randint(0, len(sorted_unique_value_tensor_list_[_]), (N, 1))), dim=1)
+    # use the generated indices to select values from each column's similarity tensor, forming a new tensor
+    candidate_md_matrix_list = []
+    for _ in range(length_):
+        candidate_md_matrix_list.append(sorted_unique_value_tensor_list_[_][indices[:, _].long()].unsqueeze(1))
+    candidate_md_matrix_ = torch.cat(candidate_md_matrix_list, dim=1)
+
+    # this tensor will be concatenated with the variants whose entries are masked to -1
+    joint_candidate_md_matrix_ = candidate_md_matrix_.clone()
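+    # masking a column to -1 drops the similarity requirement on that attribute, so the
+    # concatenated matrix also covers more general MDs that constrain only a subset of
+    # the attributes; unique(dim=0) below removes duplicate candidates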
+    # randomly set 1 column, 2 columns, ..., M-1 columns to -1
+    for i in range(length_ - 1):
+        index_list_format = []
+        for j in range(candidate_md_matrix_.shape[0]):
+            # for each MD, randomly pick the column indices to be set to -1
+            index_list_format.append(random.sample(range(length_), i + 1))
+        index = torch.tensor(index_list_format, device='cuda')
+        # the MD set after random masking to -1
+        modified_candidate = candidate_md_matrix_.scatter(1, index, -1)
+        joint_candidate_md_matrix_ = torch.cat((joint_candidate_md_matrix_, modified_candidate), 0)
+    joint_candidate_md_matrix_ = joint_candidate_md_matrix_.unique(dim=0)
+    return joint_candidate_md_matrix_
+
+
+def mds_to_txt(result_list_):
+    p = md_output_dir + "mds.txt"
+    with open(p, 'w') as f:
+        for _ in result_list_:
+            f.write(f'MD: {_[0]}\tAbsolute Support: {_[1]}\tConfidence: {_[2]}\n')
diff --git a/ml_er/magellan_new.py b/ml_er/magellan_new.py
index 14d27d0..7b43ff0 100644
--- a/ml_er/magellan_new.py
+++ b/ml_er/magellan_new.py
@@ -26,16 +26,18 @@ def blocking_mining():
     blocker = em.OverlapBlocker()
     candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
+                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=-1,
                                      overlap_size=1, show_progress=False)
     candidate['gold'] = 0
     candidate = candidate.reset_index(drop=True)
+    block_time = time.time()
+    print(f'Block Time: {block_time - start}')
 
     # label the data according to the mapping table
     candidate_match_rows = []
     for t in tqdm(mappings.itertuples()):
-        mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, 'ltable_id')])) &
-                (candidate['rtable_' + rtable_id].isin([getattr(t, 'rtable_id')])))
+        mask = ((candidate['ltable_' + ltable_id].isin([getattr(t, mapping_lid)])) &
+                (candidate['rtable_' + rtable_id].isin([getattr(t, mapping_rid)])))
         matching_indices = candidate[mask].index
         candidate_match_rows.extend(matching_indices.tolist())
     match_rows_mask = candidate.index.isin(candidate_match_rows)
@@ -60,10 +62,12 @@ def blocking_mining():
     sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
     train_set = sets['train']
     test_set = sets['test']
-    end_blocking = time.time()
-    print(end_blocking - start)
+    label_and_split_time = time.time()
+    print(f'Label and Split Time: {label_and_split_time - block_time}')
 
     mining(train_set)
+    mining_time = time.time()
+    print(f'Mining Time: {mining_time - label_and_split_time}')
     return 1
 
diff --git a/settings.py b/settings.py
index 7ebfeb8..3219095 100644
--- a/settings.py
+++ b/settings.py
@@ -1,12 +1,12 @@
 from sentence_transformers import SentenceTransformer
 
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\matches.csv'
-mapping_lid = 'ltable_id'  # name of the left-table id column in the mapping table
-mapping_rid = 'rtable_id'  # name of the right-table id column in the mapping table
-ltable_block_attr = 'name'
-rtable_block_attr = 'name'
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\matches.csv'
+mapping_lid = 'idDBLP'  # name of the left-table id column in the mapping table
+mapping_rid = 'idScholar'  # name of the right-table id column in the mapping table
+ltable_block_attr = 'title'
+rtable_block_attr = 'title'
 ltable_id = 'id'  # left-table id field name
 rtable_id = 'id'  # right-table id field name
 target_attr = 'id'  # target field for MD mining
@@ -16,7 +16,7 @@ model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
 interpre_weight = 1  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
-confidence_threshold = 0.6
+confidence_threshold = 0.75
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'