MD-metrics-HPO
HuangJintao 11 months ago
parent 3d10e93bdf
commit 4f61d93a30

@@ -7,11 +7,13 @@ from pyecharts.globals import ThemeType
 if __name__ == '__main__':
     outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
+    inter_list = ['0', '0.5', '0.7', '0.9', '1']
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
     datasets_list = os.listdir(outcome_dir)
     for _ in datasets_list:
-        path = outcome_dir + rf'\{_}' + configs_dir
+        for inter in inter_list:
+            path = outcome_dir + rf'\{_}' + configs_dir + inter
             statistics_files = os.listdir(path)
             length = 0
             for file in statistics_files:
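
Note on the hunk above: it turns a single fixed inter-0.5 run into a sweep over every interpretability weight, building each run directory by raw string concatenation. Below is a pathlib-based sketch of the same traversal, assuming the <outcome_dir>\<dataset>\<config>inter-<weight> layout implied by the diff; process_stats is a hypothetical per-file handler, not part of the repo:

from pathlib import Path

outcome_dir = Path(r'E:\Data\Research\Outcome')
config_stem = 'Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
inter_list = ['0', '0.5', '0.7', '0.9', '1']

for dataset_dir in outcome_dir.iterdir():        # one subdirectory per dataset
    for inter in inter_list:
        run_dir = dataset_dir / (config_stem + inter)
        if not run_dir.is_dir():                 # skip weights that were never run
            continue
        for stats_file in run_dir.iterdir():     # plays the role of statistics_files
            process_stats(stats_file)            # hypothetical handler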

@@ -0,0 +1,51 @@
+import os
+import pyecharts.options as opts
+from pyecharts.charts import Line
+from pyecharts.faker import Faker
+if __name__ == '__main__':
+    outcome_dir = r'E:\Data\Research\Outcome'
+    datasets_list = os.listdir(outcome_dir)
+    inter_list = ['0', '0.5', '0.7', '0.9', '1']
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
+    for _ in datasets_list:
+        block_recall_list = []
+        f1 = []
+        interpretability = []
+        for inter in inter_list:
+            path = outcome_dir + rf'\{_}' + configs_dir + inter
+            statistics_files = os.listdir(path)
+            for i in statistics_files:
+                if i.startswith('eval_result'):
+                    with open(path + rf'\{i}', 'r') as f:
+                        # read the MD metric from each line and append it to the list for this file
+                        for line in f.readlines():
+                            if line.startswith('block_recall'):
+                                lt = line.split(':')
+                                value = float(lt[1])
+                                block_recall_list.append(round(value, ndigits=3))
+                            elif line.startswith('F1'):
+                                lt = line.split(' ')
+                                value = float(lt[2].replace('%', '')) / 100
+                                f1.append(round(value, ndigits=3))
+                            elif line.startswith('interpretability'):
+                                lt = line.split(':')
+                                value = float(lt[1])
+                                interpretability.append(round(value, ndigits=3))
+        interpretability[0] = 0
+        c = (
+            Line()
+            .add_xaxis(inter_list)
+            .add_yaxis("Block Recall", block_recall_list)
+            .add_yaxis("F1", f1)
+            .add_yaxis("Interpretability", interpretability)
+            .set_global_opts(
+                title_opts=opts.TitleOpts(title=_),
+                # yaxis_opts=opts.AxisOpts(name="Block Recall", name_location="middle", name_gap=15, name_rotate=0),
+                xaxis_opts=opts.AxisOpts(name="Interpretability Weight", name_location="middle", name_gap=25)
+            )
+            .render(outcome_dir + rf'\{_}\inter_weight_lines.html')
+        )
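
The three startswith branches above each assume a slightly different line format ('block_recall: ...', an 'F1' line split on spaces with a percentage in the third token, 'interpretability: ...'). A more tolerant parser sketch, with those formats inferred from the branches rather than confirmed by the diff:

import re

METRIC_RE = re.compile(r'^(block_recall|F1|interpretability)\D*?([\d.]+)\s*(%?)')

def parse_eval_file(filepath):
    # Returns {metric_name: value}; percentage values are normalized to [0, 1].
    metrics = {}
    with open(filepath, 'r') as f:
        for line in f:
            m = METRIC_RE.match(line.strip())
            if m:
                name, raw, pct = m.groups()
                value = float(raw) / 100 if pct else float(raw)
                metrics[name] = round(value, 3)
    return metrics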

@@ -1,17 +1,17 @@
 from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
-mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
-mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv'
+mapping_lid = 'ltable_id'  # name of the left-table id column in the mapping table
+mapping_rid = 'rtable_id'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # name of the left table's id column
 rtable_id = 'id'  # name of the right table's id column
 target_attr = 'id'  # target attribute for MD mining
 # lr_attrs_map = {}  # if corresponding columns are named differently in the two tables, map them here to keep the names consistent
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
-interpre_weight = 0.9  # interpretability weight
+interpre_weight = 1  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.25
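
For context on the interpre_weight change (0.9 → 1): a weight like this usually trades interpretability off against matching quality in the objective that SMAC optimizes. The objective itself is not part of this diff, so the convex-combination sketch below is purely illustrative and both metric arguments are hypothetical names:

def hpo_objective(performance: float, interpretability: float, interpre_weight: float = 1.0) -> float:
    # Hypothetical illustration only: weight 1 optimizes interpretability alone,
    # weight 0 optimizes matching performance (e.g. F1) alone.
    return interpre_weight * interpretability + (1 - interpre_weight) * performance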

@@ -1,7 +1,13 @@
+import pandas as pd
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
 if __name__ == '__main__':
     model = SentenceTransformer('E:\\Data\\Research\\Models\\tapex-base-finetuned-wikisql')
     sentences = ['公积金转入深圳', '公积金转出深圳']  # test sentences: "transfer provident fund into/out of Shenzhen"
     embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
+    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
+    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
+    encoding = tokenizer(table, return_tensors="pt")
+    embedding = tokenizer.encode(table, return_tensors="pt")
     print(1)
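
Note on the TAPAS lines above: the tokenizer that AutoTokenizer resolves to for this checkpoint (TapasTokenizer) expects every table cell to be a string and is normally called with a table plus one or more queries, so tokenizing a raw CSV-typed DataFrame directly is likely to fail. A minimal sketch of the documented call pattern, reusing the local paths from the diff (the query string is hypothetical):

import pandas as pd
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

model_dir = r'E:\Data\Research\Models\tapas-large-finetuned-wtq'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTableQuestionAnswering.from_pretrained(model_dir)

path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1').astype(str)  # TAPAS requires string cells
queries = ['Which venue appears most often?']  # hypothetical question about the table
inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors='pt')
outputs = model(**inputs)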
