MD-metrics-HPO
HuangJintao 7 months ago
parent 9b06ce3840
commit c1e07eabb6

.gitignore vendored

@@ -6,5 +6,3 @@ tfile.py
table_embedding.py
set_none.py
generate_matches.py
ml_er/fuck.py

@@ -39,6 +39,7 @@ if __name__ == '__main__':
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
            title_opts=opts.TitleOpts(title="F1 vs. share of interpretable predictions per dataset", subtitle="interpretability weight 0.5"),
            # yaxis_opts=opts.AxisOpts(min_=0.95)  # set the y-axis starting point
        )
        .render("output/F1_Inter_bars.html")
    )
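For reference, the chart construction that precedes this hunk's .set_global_opts(...) call is not shown. A minimal self-contained sketch of the pyecharts pattern it belongs to, with made-up dataset names and scores (only the option names come from the diff):

from pyecharts import options as opts
from pyecharts.charts import Bar

datasets = ["Abt-Buy", "DBLP-ACM", "DBLP-GoogleScholar"]  # illustrative only
f1_scores = [0.94, 0.97, 0.95]                            # illustrative only
inter_ratios = [0.62, 0.81, 0.74]                         # illustrative only
(
    Bar()
    .add_xaxis(datasets)
    .add_yaxis("F1", f1_scores)
    .add_yaxis("interpretable ratio", inter_ratios)
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        title_opts=opts.TitleOpts(title="F1 vs. share of interpretable predictions per dataset",
                                  subtitle="interpretability weight 0.5"),
        # uncommenting the next line would start the y-axis at 0.95 instead of 0
        # yaxis_opts=opts.AxisOpts(min_=0.95),
    )
    .render("output/F1_Inter_bars_demo.html")
)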

@@ -1,21 +0,0 @@
import csv
import pandas as pd
import json
import sentence_transformers.util
import torch
from sentence_transformers import SentenceTransformer
from torch import nn
if __name__ == '__main__':
    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
    # keep only the pairs labeled as matches in each split
    train = train[train['label'] == 1]
    valid = valid[valid['label'] == 1]
    test = test[test['label'] == 1]
    # concatenate the splits and write them out as a single matches file
    matches = pd.concat([train, valid, test])
    matches.drop(columns=['label'], inplace=True)
    matches = matches.sort_values(by='ltable_id')
    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)

@@ -2,7 +2,7 @@ import json
import pickle
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
-from ConfigSpace.conditions import InCondition, EqualsCondition
+from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
@@ -27,7 +27,7 @@ class Classifier:
        svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
        svm_C = Integer('svm_C', (1, 100), default=1)
        svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
-       svm_degree = Integer('svm_degree', (1, 5), default=3)
+       svm_degree = Integer('svm_degree', (1, 3), default=3)
        svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
        dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
@@ -43,15 +43,20 @@ class Classifier:
        active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
        active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
        active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
-       active_svm_gamma = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
-       active_svm_degree = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
-       active_svm_constant = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
        active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
-       cs.add_conditions([active_svm_C, active_svm_constant, active_svm_degree, active_svm_gamma, active_svm_kernel,
-                          active_dt_splitter, active_rf_n_estimators, active_dt_max_features, active_rf_max_features,
-                          active_tree_max_depth, active_tree_criterion])
+       active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
+       active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
+       active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
+       active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
+       active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
+       active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
+       cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
+                          active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
+                          AndConjunction(active_svm_gamma1, active_svm_gamma2),
+                          AndConjunction(active_svm_degree1, active_svm_degree2),
+                          AndConjunction(active_svm_constant1, active_svm_constant2)])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
@@ -75,7 +80,7 @@ def ml_er_hpo():
        cs,
        crash_cost=1.0,
        deterministic=True,
-       n_trials=50,
+       n_trials=20,
        n_workers=1
    )
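The rewritten conditions above make an SVM-specific hyperparameter active only when both of its parents take suitable values: the matcher must be an SVM, and the kernel must actually use that parameter. A minimal self-contained sketch of this AndConjunction pattern (hyperparameter names follow the diff; the value lists are trimmed for brevity):

from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import AndConjunction, EqualsCondition

cs = ConfigurationSpace(seed=0)
ml_matcher = Categorical('ml_matcher', ['dt', 'rf', 'svm'], default='svm')
svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid'], default='rbf')
svm_degree = Integer('svm_degree', (1, 3), default=3)
cs.add_hyperparameters([ml_matcher, svm_kernel, svm_degree])

# svm_kernel is only sampled for the SVM matcher
active_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value='svm')
# svm_degree needs BOTH an SVM matcher AND a polynomial kernel
degree_if_svm = EqualsCondition(child=svm_degree, parent=ml_matcher, value='svm')
degree_if_poly = EqualsCondition(child=svm_degree, parent=svm_kernel, value='poly')
cs.add_conditions([active_kernel, AndConjunction(degree_if_svm, degree_if_poly)])

for cfg in cs.sample_configuration(5):
    print(cfg)  # svm_degree appears only when ml_matcher == 'svm' and svm_kernel == 'poly'

Without the second condition, SMAC would waste trials varying svm_degree for kernels that ignore it; the conjunction shrinks the effective search space.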

@@ -1,3 +1,4 @@
+import itertools
import random
import operator
from operator import itemgetter
@@ -91,6 +92,9 @@ def mining(train: pd.DataFrame):
    # result_list.sort(key=itemgetter(2), reverse=True)
    # sort by confidence first, then by support
    result_list.sort(key=itemgetter(2, 1), reverse=True)
+   result_list = merge_mds(result_list)
+   result_list.sort(key=itemgetter(2, 1), reverse=True)
    # save the results to disk
    mds_to_txt(result_list)
    return result_list
@@ -210,3 +214,42 @@ def mds_to_txt(result_list_):
        for _ in result_list_:
            f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
            f.write('\n')
# merge some MDs
def merge_mds(md_list_):
    # empty dict used for grouping
    grouped_md_tuples = {}
    # iterate over the triples and group them
    for md_tuple in md_list_:
        # use the (support, confidence) pair as the dict key
        key = (md_tuple[1], md_tuple[2])
        # check whether the key is already present in the grouping dict
        if key in grouped_md_tuples:
            # if so, append the triple to the matching list
            grouped_md_tuples[key].append(md_tuple)
        else:
            # otherwise start a new key-value pair
            grouped_md_tuples[key] = [md_tuple]
    # keep only the values: a nested list whose sublists hold MD tuples
    # sharing the same support and confidence
    grouped_md_tuples = list(grouped_md_tuples.values())
    for same_sc_list in grouped_md_tuples:
        # index list marking the tuples to delete
        indices_to_remove = []
        # length of the tuple list
        length = len(same_sc_list)
        # compare the tuples pairwise and mark redundant ones
        for i in range(length):
            for j in range(length):
                # compare the similarity-threshold dicts of the two MDs
                if i != j and all(same_sc_list[i][0][key_] >= same_sc_list[j][0][key_] for key_ in same_sc_list[i][0]):
                    # if every similarity threshold of one MD is >= the corresponding
                    # threshold of another MD in the same group, the former is redundant
                    indices_to_remove.append(i)
                    break  # i is already marked, move on to the next i
        # delete the marked tuples in reverse order so deletions
        # do not shift the indices still to be deleted
        for index in sorted(indices_to_remove, reverse=True):
            del same_sc_list[index]
    # flatten the nested list back into a flat result list
    return list(itertools.chain.from_iterable(grouped_md_tuples))
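A toy run of merge_mds, illustrating the pruning rule (attribute names are invented; the tuple layout, a dict of per-attribute similarity thresholds plus absolute support and confidence, follows the code above):

mds = [
    ({'title': 0.8, 'authors': 0.6}, 10, 0.9),  # dropped: both thresholds >= the second MD's
    ({'title': 0.7, 'authors': 0.6}, 10, 0.9),  # kept: more general at the same support/confidence
    ({'title': 0.9, 'authors': 0.2}, 10, 0.9),  # kept: incomparable with the others
    ({'title': 0.5, 'authors': 0.5}, 12, 0.8),  # kept: different (support, confidence) group
]
print(merge_mds(mds))  # three tuples survive

Within a group of equal support and confidence, an MD whose thresholds are uniformly at least as strict as another's matches no additional pairs, so only the more general MD is kept.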

@@ -34,7 +34,7 @@ def blocking_mining():
    blocker = em.OverlapBlocker()
    candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                    l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=-1,
+                                    l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
                                     overlap_size=1, show_progress=False)
    candidate['gold'] = 0
    candidate = candidate.reset_index(drop=True)
@@ -52,6 +52,7 @@ def blocking_mining():
    candidate.loc[match_rows_mask, 'gold'] = 1
    candidate.fillna(value="", inplace=True)
+   # far more negatives than positives: sample three times the number of positives
    candidate_mismatch = candidate[candidate['gold'] == 0]
    candidate_match = candidate[candidate['gold'] == 1]
    candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
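A standalone sketch of this 3:1 negative downsampling on a made-up candidate table ('pair_id' is an invented column; 'gold' follows the diff), assuming pandas:

import pandas as pd

candidate = pd.DataFrame({'pair_id': range(10),
                          'gold': [1, 0, 0, 0, 0, 0, 0, 0, 1, 0]})
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate[candidate['gold'] == 0].sample(n=3 * len(candidate_match), random_state=0)
balanced = pd.concat([candidate_match, candidate_mismatch]).sample(frac=1, random_state=0)  # shuffle rows
print(balanced['gold'].value_counts())  # 6 negatives, 2 positives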

@@ -1,27 +0,0 @@
import numpy as np
import pandas as pd
if __name__ == '__main__':
    lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
                     encoding='ISO-8859-1', sep=',')
    rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
                     encoding='ISO-8859-1', sep=',')
    mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
                          encoding='ISO-8859-1', sep=',')
    # drop columns
    # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
    #                  'genre_list', 'origin_country'], inplace=True)
    # rt.drop(columns=['job'], inplace=True)
    # rename columns
    # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
    #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
    # reorder columns
    # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
    #          'runtimeMinutes']]
    # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
    # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
    # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
    #                sep=',', index=False, header=True, quoting=1)
    print(1)

@@ -1,8 +1,8 @@
from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
mapping_lid = 'idAbt'  # column name of the left-table id in the mapping table
mapping_rid = 'idBuy'  # column name of the right-table id in the mapping table
ltable_block_attr = 'name'
@@ -13,12 +13,12 @@ target_attr = 'id'  # target attribute for MD mining
# lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, add the names here so they can be aligned
model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
-interpre_weight = 1  # interpretability weight
+interpre_weight = 0  # interpretability weight
similarity_threshold = 0.1
support_threshold = 1
confidence_threshold = 0.75
-er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
+er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
+md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
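This hunk does not show how interpre_weight enters the HPO objective; a plausible reading, consistent with the chart subtitle "interpretability weight 0.5" earlier in the commit, is a convex combination of F1 and the share of interpretable predictions. The helper below is hypothetical, not code from the repository:

def hpo_objective(f1, interpretable_ratio, interpre_weight):
    # hypothetical scalarization: with interpre_weight = 0 the search optimizes F1 alone,
    # which matches the change from 1 to 0 above
    return interpre_weight * interpretable_ratio + (1 - interpre_weight) * f1

print(hpo_objective(0.95, 0.70, 0.5))  # 0.825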

@@ -1,13 +0,0 @@
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
if __name__ == '__main__':
    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    encoding = tokenizer(table, return_tensors="pt")
    embedding = tokenizer.encode(table, return_tensors="pt")
    print(1)

@@ -1,159 +0,0 @@
import json
import multiprocessing
import os
import time
import ConfigSpace
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from ConfigSpace.read_and_write import json as csj
from md_discovery import discovery_executor
from settings import er_output_dir, hpo_output_dir
def fuck(i):
    i = i * i + 1

def test1():
    li = [[[6, 6, 2],
           [2, 4, 6],
           [2, 4, 7],
           [3, 6, 4]],
          [[6, 2, 7],
           [3, 2, 4],
           [5, 3, 5],
           [6, 2, 4]],
          [[7, 2, 2],
           [6, 3, 2],
           [6, 4, 3],
           [6, 5, 6]]]
    tensor = torch.Tensor(li)
    # pairwise cosine similarity per batch: L2-normalize rows, then matmul;
    # sim_ten / 2 + 0.5 rescales from [-1, 1] to [0, 1]
    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
    print(norm_tensor, '\n')
    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
    print(sim_ten / 2 + 0.5, '\n')
    print(sim_ten.size())

def test2():
    # multiprocessing smoke test with a spawn start method
    multiprocessing.set_start_method("spawn")
    manager = multiprocessing.Manager()
    lock = manager.Lock()
    pool = multiprocessing.Pool(16)
    with manager:
        for _ in tqdm(range(0, 1000)):
            result = pool.apply_async(fuck, args=(_,))
            print(result)

def test3():
    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
    ll = list(dic.values())
    ten = torch.Tensor(ll)
    t = ten.unsqueeze(1)
    t = t.unsqueeze(2)
    y = t.repeat(1, 742, 742)
    print(ten)
    print(y)
    print(torch.isfinite(ten))
    print(torch.count_nonzero(y).item())

def test4():
    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
    print(torch.count_nonzero(one_bool_tensor).item())

def test5():
    ten1 = torch.tensor([[1, 2, 3],
                         [7, 8, 9]])
    ten2 = torch.tensor([[4, 5, 6],
                         [11, 12, 15]])
    result = ten1 * ten2
    r = torch.sum(result, 1)
    print('\n')
    print(result)
    print(r)

def test6():
    table_tensor = torch.tensor([[[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.]],
                                 [[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.]]])
    t = torch.tensor([[1., 2., 3.],
                      [4., 5., 6.]])
    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
    print('\n')
    print(norm1)
    print(norm2)
    print(t.shape)

def test7():
    # derive the next iteration number from existing eval_result* file names
    iterations = 1
    filename_list = os.listdir(er_output_dir)
    if len(filename_list) > 0:
        for _ in filename_list:
            if _.startswith('eval_result'):
                iterations = int(_[12:13]) + 1
    print(iterations)

def test8():
    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
        dict_configspace = json.load(load_f)
    str_configspace = json.dumps(dict_configspace)
    configspace = csj.read(str_configspace)

def test9():
    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
    df.to_csv(r'./datasets/s.csv')
    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
    print(1)

def test10():
    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
                         encoding='ISO-8859-1')
    print(1)
    rtable.columns = ["id", "title", "authors", "venue", "year"]
    rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
                  sep=',', index=False, header=True, quoting=1)

def test11():
    values = {
        'block_attr': 'class',
        'confidence_thresh': 0.2717823249253852,
        'ml_blocker': 'attr_equiv',
        'ml_matcher': 'ln',
        'similarity_thresh': 0.20681820299103484,
        'support_thresh': 129,
    }
    with open(hpo_output_dir + "incumbent.json", "w") as f:
        json.dump(values, f, indent=4)

def test12():
    with open(hpo_output_dir + "incumbent.json", 'r') as f:
        dic = json.load(f)
    for _ in dic.keys():
        print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')

def test13():
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    f = []
    for _ in datasets_list:
        f.append(outcome_dir + rf'\{_}' + configs_dir)
    print(f)