MD-metrics-HPO
HuangJintao 7 months ago
parent 9b06ce3840
commit c1e07eabb6

.gitignore

@@ -6,5 +6,3 @@ tfile.py
 table_embedding.py
 set_none.py
 generate_matches.py
-ml_er/fuck.py

@@ -39,6 +39,7 @@ if __name__ == '__main__':
         .set_global_opts(
             xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
             title_opts=opts.TitleOpts(title="F1 and proportion of interpretable predictions per dataset", subtitle="interpretability weight 0.5"),
+            # yaxis_opts=opts.AxisOpts(min_=0.95)  # set the starting value of the y-axis
         )
         .render("output/F1_Inter_bars.html")
     )
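
The line added above (still commented out) is the hook for zooming the y-axis when every dataset's F1 sits close to 1. A minimal pyecharts sketch of the same option, with made-up data and output path, in case the effect is unclear:

from pyecharts import options as opts
from pyecharts.charts import Bar

bar = (
    Bar()
    .add_xaxis(["Abt-Buy", "DBLP-ACM"])      # hypothetical datasets
    .add_yaxis("F1", [0.96, 0.99])           # hypothetical scores
    .set_global_opts(
        yaxis_opts=opts.AxisOpts(min_=0.95)  # start the y-axis at 0.95 instead of 0
    )
)
bar.render("output/zoom_demo.html")          # hypothetical output file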

@@ -1,21 +0,0 @@
-import csv
-import pandas as pd
-import json
-import sentence_transformers.util
-import torch
-from sentence_transformers import SentenceTransformer
-from torch import nn
-
-
-if __name__ == '__main__':
-    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
-    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
-    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
-    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
-    train = train[train['label'] == 1]
-    valid = valid[valid['label'] == 1]
-    test = test[test['label'] == 1]
-    matches = pd.concat([train, valid, test])
-    matches.drop(columns=['label'], inplace=True)
-    matches = matches.sort_values(by='ltable_id')
-    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)

@@ -2,7 +2,7 @@ import json
 import pickle
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
-from ConfigSpace.conditions import InCondition, EqualsCondition
+from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
@@ -27,7 +27,7 @@ class Classifier:
         svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
         svm_C = Integer('svm_C', (1, 100), default=1)
         svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
-        svm_degree = Integer('svm_degree', (1, 5), default=3)
+        svm_degree = Integer('svm_degree', (1, 3), default=3)
         svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
         dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
@@ -43,15 +43,20 @@ class Classifier:
         active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
         active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
         active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
-        active_svm_gamma = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
-        active_svm_degree = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
-        active_svm_constant = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
         active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
-        cs.add_conditions([active_svm_C, active_svm_constant, active_svm_degree, active_svm_gamma, active_svm_kernel,
-                           active_dt_splitter, active_rf_n_estimators, active_dt_max_features, active_rf_max_features,
-                           active_tree_max_depth, active_tree_criterion])
+        active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
+        active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
+        active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
+        active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
+        active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
+        active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
+        cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
+                           active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
+                           AndConjunction(active_svm_gamma1, active_svm_gamma2),
+                           AndConjunction(active_svm_degree1, active_svm_degree2),
+                           AndConjunction(active_svm_constant1, active_svm_constant2)])

         return cs

     def train(self, config: Configuration, seed: int = 0) -> float:
@@ -75,7 +80,7 @@ def ml_er_hpo():
         cs,
         crash_cost=1.0,
         deterministic=True,
-        n_trials=50,
+        n_trials=20,
         n_workers=1
     )
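
The new AndConjunction conditions make the kernel-specific SVM hyperparameters active only when both parents agree: the matcher must be an SVM, and the sampled kernel must actually use the parameter. A self-contained sketch of that pattern (hyperparameters trimmed down; an illustration, not the project's full space):

from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import EqualsCondition, AndConjunction

cs = ConfigurationSpace(seed=0)
ml_matcher = Categorical('ml_matcher', ['dt', 'svm'], default='svm')
svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf'], default='rbf')
svm_degree = Integer('svm_degree', (1, 3), default=3)
cs.add_hyperparameters([ml_matcher, svm_kernel, svm_degree])

# svm_kernel exists only for SVM; svm_degree exists only for SVM *and* the poly kernel.
cs.add_conditions([
    EqualsCondition(child=svm_kernel, parent=ml_matcher, value='svm'),
    AndConjunction(
        EqualsCondition(child=svm_degree, parent=ml_matcher, value='svm'),
        EqualsCondition(child=svm_degree, parent=svm_kernel, value='poly'),
    ),
])

for cfg in cs.sample_configuration(5):
    print(cfg)  # svm_degree appears only in configurations where svm_kernel == 'poly'

Pruning inactive hyperparameters this way keeps every trial focused on parameters that actually affect the chosen matcher, which matters more now that n_trials has been cut from 50 to 20.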

@@ -1,3 +1,4 @@
+import itertools
 import random
 import operator
 from operator import itemgetter
@@ -91,6 +92,9 @@ def mining(train: pd.DataFrame):
     # result_list.sort(key=itemgetter(2), reverse=True)
     # sort by priority: confidence first, then support
     result_list.sort(key=itemgetter(2, 1), reverse=True)
+    result_list = merge_mds(result_list)
+    result_list.sort(key=itemgetter(2, 1), reverse=True)
+    # save results to disk
     mds_to_txt(result_list)
     return result_list
@@ -210,3 +214,42 @@ def mds_to_txt(result_list_):
         for _ in result_list_:
             f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
             f.write('\n')
+
+
+# merge some MDs
+def merge_mds(md_list_):
+    # empty dict used for grouping
+    grouped_md_tuples = {}
+    # iterate over the triples and group them
+    for md_tuple in md_list_:
+        # use the (support, confidence) pair as the dict key
+        key = (md_tuple[1], md_tuple[2])
+        # check whether the key is already in the grouping dict
+        if key in grouped_md_tuples:
+            # if so, append the triple to the corresponding list
+            grouped_md_tuples[key].append(md_tuple)
+        else:
+            # otherwise create a new key-value pair
+            grouped_md_tuples[key] = [md_tuple]
+    # keep only the values, not the keys:
+    # a nested list where every MD tuple in a sub-list shares the same support and confidence
+    grouped_md_tuples = list(grouped_md_tuples.values())
+    for same_sc_list in grouped_md_tuples:
+        # index list marking the tuples to delete
+        indices_to_remove = []
+        # length of the tuple list
+        length = len(same_sc_list)
+        # compare the tuples pairwise and mark dominated ones
+        for i in range(length):
+            for j in range(length):
+                # compare the threshold dicts of the two tuples
+                if i != j and all(same_sc_list[i][0][key_] >= same_sc_list[j][0][key_] for key_ in same_sc_list[i][0]):
+                    # if every similarity threshold of one MD is >= those of another MD in the same group, the former can be deleted
+                    indices_to_remove.append(i)
+                    break  # the list will be modified later, so leave the inner loop
+        # delete the marked tuples in reverse index order so deletions do not shift the remaining indices
+        for index in sorted(indices_to_remove, reverse=True):
+            del same_sc_list[index]
+    # flatten the nested list
+    return list(itertools.chain.from_iterable(grouped_md_tuples))
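
merge_mds prunes dominated MDs inside each (support, confidence) group: if every similarity threshold of one MD is at least as large as the corresponding threshold of another MD with identical support and confidence, the stricter MD carries no extra information and is dropped. A toy call (attribute names and numbers invented for illustration; assumes the merge_mds definition above and `import itertools` are available):

mds = [
    ({'title': 0.9, 'authors': 0.8}, 50, 0.96),  # dominated by the next MD, removed
    ({'title': 0.8, 'authors': 0.8}, 50, 0.96),  # kept
    ({'title': 0.7, 'authors': 0.9}, 40, 0.90),  # different (support, confidence) group, kept
]
print(merge_mds(mds))
# [({'title': 0.8, 'authors': 0.8}, 50, 0.96), ({'title': 0.7, 'authors': 0.9}, 40, 0.90)]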

@@ -34,7 +34,7 @@ def blocking_mining():
     blocker = em.OverlapBlocker()
     candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=-1,
+                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
                                      overlap_size=1, show_progress=False)
     candidate['gold'] = 0
     candidate = candidate.reset_index(drop=True)
@@ -52,6 +52,7 @@ def blocking_mining():
     candidate.loc[match_rows_mask, 'gold'] = 1
     candidate.fillna(value="", inplace=True)
+    # far too many negative samples; sample three times the number of positive samples
     candidate_mismatch = candidate[candidate['gold'] == 0]
     candidate_match = candidate[candidate['gold'] == 1]
     candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
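
The added comment documents the downsampling: overlap blocking produces far more non-matches than matches, so negatives are capped at three per positive before the matcher is trained. A standalone sketch of the same idea (toy data; random_state is added here only for reproducibility and is not in the original):

import pandas as pd

candidate = pd.DataFrame({'gold': [1, 1] + [0] * 20})  # hypothetical blocked candidate set
candidate_match = candidate[candidate['gold'] == 1]
candidate_mismatch = candidate[candidate['gold'] == 0]
candidate_mismatch = candidate_mismatch.sample(n=3 * len(candidate_match), random_state=0)
balanced = pd.concat([candidate_match, candidate_mismatch]).reset_index(drop=True)
print(balanced['gold'].value_counts())  # 6 negatives, 2 positives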

@@ -1,27 +0,0 @@
-import numpy as np
-import pandas as pd
-
-
-if __name__ == '__main__':
-    lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-                          encoding='ISO-8859-1', sep=',')
-    # drop columns
-    # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
-    #                  'genre_list', 'origin_country'], inplace=True)
-    # rt.drop(columns=['job'], inplace=True)
-    # rename columns
-    # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
-    #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
-    # reorder columns
-    # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
-    #          'runtimeMinutes']]
-    # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
-    # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
-    # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-    #                sep=',', index=False, header=True, quoting=1)
-    print(1)

@@ -1,8 +1,8 @@
 from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
 mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
 mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
 ltable_block_attr = 'name'
@@ -13,12 +13,12 @@ target_attr = 'id'  # target attribute for MD mining
 # lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, list them here so they can be aligned
 model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
-interpre_weight = 1  # interpretability weight
+interpre_weight = 0  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.75
-er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
+er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
+md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
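
interpre_weight drops from 1 to 0, i.e. the HPO objective now ignores interpretability and optimizes F1 alone (the bar-chart title earlier in this commit refers to a 0.5 setting of the same weight). The exact scoring formula is not part of this diff; the sketch below shows the usual convex combination such a weight implies, as an assumption for orientation only:

def weighted_score(f1: float, interpretable_ratio: float, interpre_weight: float) -> float:
    # Assumed form, not taken from the repository: blend F1 with the share of
    # predictions that the mined MDs can explain.
    return (1 - interpre_weight) * f1 + interpre_weight * interpretable_ratio

print(weighted_score(0.95, 0.80, 0.5))  # 0.875
print(weighted_score(0.95, 0.80, 0.0))  # 0.95 -> pure F1, matching the new setting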

@@ -1,13 +0,0 @@
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
-
-
-if __name__ == '__main__':
-    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
-    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    encoding = tokenizer(table, return_tensors="pt")
-    embedding = tokenizer.encode(table, return_tensors="pt")
-    print(1)

@@ -1,159 +0,0 @@
-import json
-import multiprocessing
-import os
-import time
-import ConfigSpace
-import numpy as np
-import pandas as pd
-import torch
-from tqdm import tqdm
-from ConfigSpace.read_and_write import json as csj
-from md_discovery import discovery_executor
-from settings import er_output_dir, hpo_output_dir
-
-def fuck(i):
-    i = i * i + 1
-
-def test1():
-    li = [[[6, 6, 2],
-           [2, 4, 6],
-           [2, 4, 7],
-           [3, 6, 4]],
-          [[6, 2, 7],
-           [3, 2, 4],
-           [5, 3, 5],
-           [6, 2, 4]],
-          [[7, 2, 2],
-           [6, 3, 2],
-           [6, 4, 3],
-           [6, 5, 6]]]
-    tensor = torch.Tensor(li)
-    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    print(norm_tensor, '\n')
-    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    print(sim_ten / 2 + 0.5, '\n')
-    print(sim_ten.size())
-
-def test2():
-    multiprocessing.set_start_method("spawn")
-    manager = multiprocessing.Manager()
-    lock = manager.Lock()
-    pool = multiprocessing.Pool(16)
-    with manager:
-        for _ in tqdm(range(0, 1000)):
-            result = pool.apply_async(fuck, args=(_,))
-        print(result)
-
-def test3():
-    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
-    ll = list(dic.values())
-    ten = torch.Tensor(ll)
-    t = ten.unsqueeze(1)
-    t = t.unsqueeze(2)
-    y = t.repeat(1, 742, 742)
-    print(ten)
-    print(y)
-    print(torch.isfinite(ten))
-    print(torch.count_nonzero(y).item())
-
-def test4():
-    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
-    print(torch.count_nonzero(one_bool_tensor).item())
-
-def test5():
-    ten1 = torch.tensor([[1, 2, 3],
-                         [7, 8, 9]])
-    ten2 = torch.tensor([[4, 5, 6],
-                         [11, 12, 15]])
-    result = ten1 * ten2
-    r = torch.sum(result, 1)
-    print('\n')
-    print(result)
-    print(r)
-
-def test6():
-    table_tensor = torch.tensor([[[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]],
-                                 [[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]]])
-    t = torch.tensor([[1., 2., 3.],
-                      [4., 5., 6.]])
-    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
-    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
-    print('\n')
-    print(norm1)
-    print(norm2)
-    print(t.shape)
-
-def test7():
-    iterations = 1
-    filename_list = os.listdir(er_output_dir)
-    if len(filename_list) > 0:
-        for _ in filename_list:
-            if _.startswith('eval_result'):
-                iterations = int(_[12:13]) + 1
-    print(iterations)
-
-def test8():
-    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
-        dict_configspace = json.load(load_f)
-    str_configspace = json.dumps(dict_configspace)
-    configspace = csj.read(str_configspace)
-
-def test9():
-    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
-    df.to_csv(r'./datasets/s.csv')
-    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
-    print(1)
-
-def test10():
-    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
-                         encoding='ISO-8859-1')
-    print(1)
-    rtable.columns = ["id", "title", "authors", "venue", "year"]
-    rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
-                  sep=',', index=False, header=True, quoting=1)
-
-def test11():
-    values = {
-        'block_attr': 'class',
-        'confidence_thresh': 0.2717823249253852,
-        'ml_blocker': 'attr_equiv',
-        'ml_matcher': 'ln',
-        'similarity_thresh': 0.20681820299103484,
-        'support_thresh': 129,
-    }
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
-        json.dump(values, f, indent=4)
-
-def test12():
-    with open(hpo_output_dir + "incumbent.json", 'r') as f:
-        dic = json.load(f)
-    for _ in dic.keys():
-        print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
-
-def test13():
-    outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
-    datasets_list = os.listdir(outcome_dir)
-    f = []
-    for _ in datasets_list:
-        f.append(outcome_dir + rf'\{_}' + configs_dir)
-    print(f)