From c1e07eabb60971e52f96331b977995bc737113a3 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Mon, 29 Apr 2024 12:52:01 +0800
Subject: [PATCH] commit

---
 .gitignore                 |   2 -
 draw/draw_f1_inter_bars.py |   1 +
 generate_matches.py        |  21 -----
 hpo/magellan_hpo.py        |  27 ++++---
 md_discovery/md_mining.py  |  43 ++++++++++
 ml_er/magellan_new.py      |   3 +-
 set_none.py                |  27 -------
 settings.py                |  14 ++--
 table_embedding.py         |  13 ---
 tfile.py                   | 159 -------------------------------------
 10 files changed, 69 insertions(+), 241 deletions(-)
 delete mode 100644 generate_matches.py
 delete mode 100644 set_none.py
 delete mode 100644 table_embedding.py
 delete mode 100644 tfile.py

diff --git a/.gitignore b/.gitignore
index dcb64f0..8fb99df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,5 +6,3 @@ tfile.py
 table_embedding.py
 set_none.py
 generate_matches.py
-ml_er/fuck.py
-
diff --git a/draw/draw_f1_inter_bars.py b/draw/draw_f1_inter_bars.py
index 48aae8c..59ad218 100644
--- a/draw/draw_f1_inter_bars.py
+++ b/draw/draw_f1_inter_bars.py
@@ -39,6 +39,7 @@ if __name__ == '__main__':
         .set_global_opts(
             xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
             title_opts=opts.TitleOpts(title="各数据集F1值与预测结果可解释比例", subtitle="可解释性比重:0.5"),
+            # yaxis_opts=opts.AxisOpts(min_=0.95)  # set the starting point of the y-axis
         )
         .render("output/F1_Inter_bars.html")
     )
diff --git a/generate_matches.py b/generate_matches.py
deleted file mode 100644
index c049189..0000000
--- a/generate_matches.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import csv
-
-import pandas as pd
-import json
-import sentence_transformers.util
-import torch
-from sentence_transformers import SentenceTransformer
-from torch import nn
-
-if __name__ == '__main__':
-    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
-    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
-    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
-    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
-    train = train[train['label'] == 1]
-    valid = valid[valid['label'] == 1]
-    test = test[test['label'] == 1]
-    matches = pd.concat([train, valid, test])
-    matches.drop(columns=['label'], inplace=True)
-    matches = matches.sort_values(by='ltable_id')
-    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)
diff --git a/hpo/magellan_hpo.py b/hpo/magellan_hpo.py
index 1bd051b..c7f26b9 100644
--- a/hpo/magellan_hpo.py
+++ b/hpo/magellan_hpo.py
@@ -2,7 +2,7 @@ import json
 import pickle
 
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
-from ConfigSpace.conditions import InCondition, EqualsCondition
+from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
@@ -27,7 +27,7 @@ class Classifier:
         svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
         svm_C = Integer('svm_C', (1, 100), default=1)
         svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
-        svm_degree = Integer('svm_degree', (1, 5), default=3)
+        svm_degree = Integer('svm_degree', (1, 3), default=3)
         svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
 
         dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
@@ -43,15 +43,20 @@ class Classifier:
         active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
         active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
         active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
-        active_svm_gamma = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
-        active_svm_degree = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
-        active_svm_constant = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
         active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
-
-        cs.add_conditions([active_svm_C, active_svm_constant, active_svm_degree, active_svm_gamma, active_svm_kernel,
-                           active_dt_splitter, active_rf_n_estimators, active_dt_max_features, active_rf_max_features,
-                           active_tree_max_depth, active_tree_criterion])
-
+        active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
+        active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
+        active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
+
+        active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
+        active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
+        active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
+
+        cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
+                           active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
+                           AndConjunction(active_svm_gamma1, active_svm_gamma2),
+                           AndConjunction(active_svm_degree1, active_svm_degree2),
+                           AndConjunction(active_svm_constant1, active_svm_constant2)])
         return cs
 
     def train(self, config: Configuration, seed: int = 0) -> float:
@@ -75,7 +80,7 @@ def ml_er_hpo():
         cs,
         crash_cost=1.0,
         deterministic=True,
-        n_trials=50,
+        n_trials=20,
         n_workers=1
     )
 
diff --git a/md_discovery/md_mining.py b/md_discovery/md_mining.py
index 0a5a84e..d0f0c72 100644
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@@ -1,3 +1,4 @@
+import itertools
 import random
 import operator
 from operator import itemgetter
@@ -91,6 +92,9 @@ def mining(train: pd.DataFrame):
     # result_list.sort(key=itemgetter(2), reverse=True)
     # sort with confidence taking priority over support
     result_list.sort(key=itemgetter(2, 1), reverse=True)
+    result_list = merge_mds(result_list)
+    result_list.sort(key=itemgetter(2, 1), reverse=True)
+
     # save the results locally
     mds_to_txt(result_list)
     return result_list
@@ -210,3 +214,42 @@ def mds_to_txt(result_list_):
         for _ in result_list_:
             f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
             f.write('\n')
+
+
+# merge (prune) some of the mined MDs
+def merge_mds(md_list_):
+    # empty dict used for grouping
+    grouped_md_tuples = {}
+    # iterate over the triples and group them
+    for md_tuple in md_list_:
+        # use the support and confidence values as the dict key
+        key = (md_tuple[1], md_tuple[2])
+        # check whether the key already exists in the grouping dict
+        if key in grouped_md_tuples:
+            # if it does, append the triple to the corresponding list
+            grouped_md_tuples[key].append(md_tuple)
+        else:
+            # otherwise create a new key-value pair
+            grouped_md_tuples[key] = [md_tuple]
+    # keep only the values, not the keys:
+    # a nested list in which all MD tuples of a sub-list share the same support and confidence
+    grouped_md_tuples = list(grouped_md_tuples.values())
+
+    for same_sc_list in grouped_md_tuples:
+        # index list marking the tuples that need to be removed
+        indices_to_remove = []
+        # length of the tuple list
+        length = len(same_sc_list)
+        # traverse the tuple list, comparing tuples and marking dominated ones
+        for i in range(length):
+            for j in range(length):
+                # compare the threshold dictionaries of the two tuples
+                if i != j and all(same_sc_list[i][0][key_] >= same_sc_list[j][0][key_] for key_ in same_sc_list[i][0]):
+                    # if every similarity threshold of one MD is >= those of another MD in the same group, the former can be removed
+                    indices_to_remove.append(i)
+                    break  # break out of the inner loop because the list size will change later
+        # delete the marked tuples by index in reverse order to avoid index shifts while deleting
+        for index in sorted(indices_to_remove, reverse=True):
+            del same_sc_list[index]
+    # flatten the nested list into a flat list
+    return list(itertools.chain.from_iterable(grouped_md_tuples))
diff --git a/ml_er/magellan_new.py b/ml_er/magellan_new.py
index 61c4926..eeee62d 100644
--- a/ml_er/magellan_new.py
+++ b/ml_er/magellan_new.py
@@ -34,7 +34,7 @@ def blocking_mining():
 
     blocker = em.OverlapBlocker()
     candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=-1,
+                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
                                      overlap_size=1, show_progress=False)
     candidate['gold'] = 0
     candidate = candidate.reset_index(drop=True)
@@ -52,6 +52,7 @@ def blocking_mining():
     candidate.loc[match_rows_mask, 'gold'] = 1
     candidate.fillna(value="", inplace=True)
 
+    # far too many negative samples, so sample three times the number of positive samples
    candidate_mismatch = candidate[candidate['gold'] == 0]
     candidate_match = candidate[candidate['gold'] == 1]
     candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
diff --git a/set_none.py b/set_none.py
deleted file mode 100644
index 43db872..0000000
--- a/set_none.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import numpy as np
-import pandas as pd
-
-if __name__ == '__main__':
-    lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-                          encoding='ISO-8859-1', sep=',')
-    # drop columns
-    # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
-    #                  'genre_list', 'origin_country'], inplace=True)
-    # rt.drop(columns=['job'], inplace=True)
-    # rename columns
-    # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
-    #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
-    # reorder columns
-    # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
-    #          'runtimeMinutes']]
-    # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
-    # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
-    # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-    #                sep=',', index=False, header=True, quoting=1)
-    print(1)
-
-
diff --git a/settings.py b/settings.py
index e550c5d..00eb5bc 100644
--- a/settings.py
+++ b/settings.py
@@ -1,8 +1,8 @@
 from sentence_transformers import SentenceTransformer
 
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
 mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
 mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
 ltable_block_attr = 'name'
@@ -13,12 +13,12 @@ target_attr = 'id'  # target attribute for MD mining
 # lr_attrs_map = {}  # if corresponding columns in the two tables have different names, add them here so they can be made consistent
 
 model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
-interpre_weight = 1  # interpretability weight
+interpre_weight = 0  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.75
 
-er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
+er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
+md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
 
diff --git a/table_embedding.py b/table_embedding.py
deleted file mode 100644
index ee4ae89..0000000
--- a/table_embedding.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
-
-if __name__ == '__main__':
-    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
-    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-
-    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    encoding = tokenizer(table, return_tensors="pt")
-    embedding = tokenizer.encode(table, return_tensors="pt")
-    print(1)
diff --git a/tfile.py b/tfile.py
deleted file mode 100644
index 2a98cab..0000000
--- a/tfile.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import json
-import multiprocessing
-import os
-import time
-
-import ConfigSpace
-import numpy as np
-import pandas as pd
-import torch
-from tqdm import tqdm
-from ConfigSpace.read_and_write import json as csj
-from md_discovery import discovery_executor
-from settings import er_output_dir, hpo_output_dir
-
-
-def fuck(i):
-    i = i * i + 1
-
-
-def test1():
-    li = [[[6, 6, 2],
-           [2, 4, 6],
-           [2, 4, 7],
-           [3, 6, 4]],
-          [[6, 2, 7],
-           [3, 2, 4],
-           [5, 3, 5],
-           [6, 2, 4]],
-          [[7, 2, 2],
-           [6, 3, 2],
-           [6, 4, 3],
-           [6, 5, 6]]]
-    tensor = torch.Tensor(li)
-    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    print(norm_tensor, '\n')
-    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    print(sim_ten / 2 + 0.5, '\n')
-    print(sim_ten.size())
-
-
-def test2():
-    multiprocessing.set_start_method("spawn")
-    manager = multiprocessing.Manager()
-    lock = manager.Lock()
-    pool = multiprocessing.Pool(16)
-    with manager:
-        for _ in tqdm(range(0, 1000)):
-            result = pool.apply_async(fuck, args=(_,))
-            print(result)
-
-
-def test3():
-    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
-    ll = list(dic.values())
-    ten = torch.Tensor(ll)
-    t = ten.unsqueeze(1)
-    t = t.unsqueeze(2)
-    y = t.repeat(1, 742, 742)
-    print(ten)
-    print(y)
-    print(torch.isfinite(ten))
-    print(torch.count_nonzero(y).item())
-
-
-def test4():
-    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
-    print(torch.count_nonzero(one_bool_tensor).item())
-
-
-def test5():
-    ten1 = torch.tensor([[1, 2, 3],
-                         [7, 8, 9]])
-    ten2 = torch.tensor([[4, 5, 6],
-                         [11, 12, 15]])
-    result = ten1 * ten2
-    r = torch.sum(result, 1)
-    print('\n')
-    print(result)
-    print(r)
-
-
-def test6():
-    table_tensor = torch.tensor([[[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]],
-                                 [[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]]])
-    t = torch.tensor([[1., 2., 3.],
-                      [4., 5., 6.]])
-    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
-    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
-    print('\n')
-    print(norm1)
-    print(norm2)
-    print(t.shape)
-
-
-def test7():
-    iterations = 1
-    filename_list = os.listdir(er_output_dir)
-    if len(filename_list) > 0:
-        for _ in filename_list:
-            if _.startswith('eval_result'):
-                iterations = int(_[12:13]) + 1
-    print(iterations)
-
-
-def test8():
-    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
-        dict_configspace = json.load(load_f)
-    str_configspace = json.dumps(dict_configspace)
-    configspace = csj.read(str_configspace)
-
-
-def test9():
-    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
-    df.to_csv(r'./datasets/s.csv')
-    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
-    print(1)
-
-
-def test10():
-    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
-                         encoding='ISO-8859-1')
-    print(1)
-    rtable.columns = ["id", "title", "authors", "venue", "year"]
-    rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
-                  sep=',', index=False, header=True, quoting=1)
-
-
-def test11():
-    values = {
-        'block_attr': 'class',
-        'confidence_thresh': 0.2717823249253852,
-        'ml_blocker': 'attr_equiv',
-        'ml_matcher': 'ln',
-        'similarity_thresh': 0.20681820299103484,
-        'support_thresh': 129,
-    }
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
-        json.dump(values, f, indent=4)
-
-
-def test12():
-    with open(hpo_output_dir + "incumbent.json", 'r') as f:
-        dic = json.load(f)
-    for _ in dic.keys():
-        print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
-
-
-def test13():
-    outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
-    datasets_list = os.listdir(outcome_dir)
-    f = []
-    for _ in datasets_list:
-        f.append(outcome_dir + rf'\{_}' + configs_dir)
-    print(f)
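Note on the hpo/magellan_hpo.py change above: the flat EqualsCondition list is replaced by AndConjunctions so that each SVM-specific hyperparameter is active only when both parents allow it (the matcher is an SVM and the chosen kernel actually uses the parameter). The following is a minimal, stand-alone ConfigSpace sketch of that pattern, not code from the repository; the hyperparameter names are illustrative.

    from ConfigSpace import Categorical, ConfigurationSpace, Integer
    from ConfigSpace.conditions import AndConjunction, EqualsCondition

    cs = ConfigurationSpace(seed=0)

    # Illustrative (hypothetical) hyperparameters: a matcher choice,
    # an SVM kernel, and a polynomial degree.
    matcher = Categorical('matcher', ['svm', 'dt'], default='svm')
    kernel = Categorical('kernel', ['linear', 'poly', 'rbf'], default='rbf')
    degree = Integer('degree', (1, 3), default=3)
    cs.add_hyperparameters([matcher, kernel, degree])

    # The kernel is only meaningful when the matcher is an SVM.
    cs.add_condition(EqualsCondition(child=kernel, parent=matcher, value='svm'))

    # The degree has two parents: matcher == 'svm' AND kernel == 'poly'.
    cs.add_condition(AndConjunction(
        EqualsCondition(child=degree, parent=matcher, value='svm'),
        EqualsCondition(child=degree, parent=kernel, value='poly'),
    ))

    # Sampled configurations contain 'degree' only when both conditions hold.
    for config in cs.sample_configuration(5):
        print(config)

With conditions of this form, inactive hyperparameters are simply absent from sampled configurations, so the optimizer does not spend trials varying parameters the trained model would ignore.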