MD-metrics-HPO
HuangJintao 11 months ago
parent 3d10e93bdf
commit 4f61d93a30

@@ -7,48 +7,50 @@ from pyecharts.globals import ThemeType
 if __name__ == '__main__':
     outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-0.5'
+    inter_list = ['0', '0.5', '0.7', '0.9', '1']
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
     datasets_list = os.listdir(outcome_dir)
     for _ in datasets_list:
-        path = outcome_dir + rf'\{_}' + configs_dir
-        statistics_files = os.listdir(path)
-        length = 0
-        for file in statistics_files:
-            if file.startswith('predictions'):
-                preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
-                preds = preds[['predicted', 'confidence']]
-                preds = preds.astype(float)
-                preds = preds[preds['predicted'] == 1.0]
-                length = len(preds)
-        li = []
-        zeros = len(preds[preds['confidence'] == 0])
-        dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
-        dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
-        dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
-        dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
-        dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
-        for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
-            li.append(round(number * 100 / length, ndigits=3))
-        c = (
-            Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
-            .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
-            .add_yaxis(_, li, category_gap=2)
-            .set_global_opts(
-                yaxis_opts=opts.AxisOpts(
-                    name="Proportion",
-                    type_="value",
-                    min_=0,
-                    max_=100,
-                    position="left",
-                    axisline_opts=opts.AxisLineOpts(
-                        linestyle_opts=opts.LineStyleOpts()
-                    ),
-                    axislabel_opts=opts.LabelOpts(formatter="{value}%"),
-                ),
-                title_opts=opts.TitleOpts(title="Confidence Histogram"),
-                xaxis_opts=opts.AxisOpts(name="Intervals")
-            )
-            .render(path + r"\confidence_histogram.html")
-        )
+        for inter in inter_list:
+            path = outcome_dir + rf'\{_}' + configs_dir + inter
+            statistics_files = os.listdir(path)
+            length = 0
+            for file in statistics_files:
+                if file.startswith('predictions'):
+                    preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
+                    preds = preds[['predicted', 'confidence']]
+                    preds = preds.astype(float)
+                    preds = preds[preds['predicted'] == 1.0]
+                    length = len(preds)
+            li = []
+            zeros = len(preds[preds['confidence'] == 0])
+            dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
+            dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
+            dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
+            dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
+            dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
+            for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
+                li.append(round(number * 100 / length, ndigits=3))
+            c = (
+                Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
+                .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
+                .add_yaxis(_, li, category_gap=2)
+                .set_global_opts(
+                    yaxis_opts=opts.AxisOpts(
+                        name="Proportion",
+                        type_="value",
+                        min_=0,
+                        max_=100,
+                        position="left",
+                        axisline_opts=opts.AxisLineOpts(
+                            linestyle_opts=opts.LineStyleOpts()
+                        ),
+                        axislabel_opts=opts.LabelOpts(formatter="{value}%"),
+                    ),
+                    title_opts=opts.TitleOpts(title="Confidence Histogram"),
+                    xaxis_opts=opts.AxisOpts(name="Intervals")
+                )
+                .render(path + r"\confidence_histogram.html")
+            )
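Both versions of this script compute the same six-bucket breakdown: confidences of rows predicted as matches (predicted == 1.0) are counted in the ranges 0, (0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1] and converted to percentages. A minimal sketch of an equivalent binning with pandas.cut, assuming only a predictions CSV with 'predicted' and 'confidence' columns (the file path here is a placeholder, not from the repository):

import pandas as pd

preds = pd.read_csv('predictions.csv', encoding='ISO-8859-1')  # placeholder path
preds = preds[['predicted', 'confidence']].astype(float)
preds = preds[preds['predicted'] == 1.0]

# Right-inclusive bins reproduce the ranges above; the tiny negative lower
# edge lets confidence == 0 fall into its own bucket.
bins = [-1e-9, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1']
counts = pd.cut(preds['confidence'], bins=bins, labels=labels).value_counts(sort=False)
proportions = (counts * 100 / len(preds)).round(3)  # percentages, as in the script
print(proportions)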

@@ -0,0 +1,51 @@
+import os
+
+import pyecharts.options as opts
+from pyecharts.charts import Line
+from pyecharts.faker import Faker
+
+if __name__ == '__main__':
+    outcome_dir = r'E:\Data\Research\Outcome'
+    datasets_list = os.listdir(outcome_dir)
+    inter_list = ['0', '0.5', '0.7', '0.9', '1']
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens+inter-'
+    for _ in datasets_list:
+        block_recall_list = []
+        f1 = []
+        interpretability = []
+        for inter in inter_list:
+            path = outcome_dir + rf'\{_}' + configs_dir + inter
+            statistics_files = os.listdir(path)
+            for i in statistics_files:
+                if i.startswith('eval_result'):
+                    with open(path + rf'\{i}', 'r') as f:
+                        # read the md from each line and append it to this file's md list
+                        for line in f.readlines():
+                            if line.startswith('block_recall'):
+                                lt = line.split(':')
+                                value = float(lt[1])
+                                block_recall_list.append(round(value, ndigits=3))
+                            elif line.startswith('F1'):
+                                lt = line.split(' ')
+                                value = float(lt[2].replace('%', '')) / 100
+                                f1.append(round(value, ndigits=3))
+                            elif line.startswith('interpretability'):
+                                lt = line.split(':')
+                                value = float(lt[1])
+                                interpretability.append(round(value, ndigits=3))
+        interpretability[0] = 0
+        c = (
+            Line()
+            .add_xaxis(inter_list)
+            .add_yaxis("Block Recall", block_recall_list)
+            .add_yaxis("F1", f1)
+            .add_yaxis("Interpretability", interpretability)
+            .set_global_opts(
+                title_opts=opts.TitleOpts(title=_),
+                # yaxis_opts=opts.AxisOpts(name="Block Recall", name_location="middle", name_gap=15, name_rotate=0),
+                xaxis_opts=opts.AxisOpts(name="Interpretability Weight", name_location="middle", name_gap=25)
+            )
+            .render(outcome_dir + rf'\{_}\inter_weight_lines.html')
+        )
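The parsing above implies the layout of the eval_result files: block_recall and interpretability appear as colon-separated key/value lines, while the F1 line carries a percentage as its third whitespace-separated token. A minimal sketch of that parsing factored into a helper (the function name is ours, not from the repository, and the format is inferred only from the split indices in the diff):

def parse_metric_line(line: str):
    # 'block_recall:<value>' and 'interpretability:<value>' are colon-separated.
    if line.startswith('block_recall') or line.startswith('interpretability'):
        key, value = line.split(':', 1)
        return key.strip(), round(float(value), ndigits=3)
    # The F1 line's third space-separated token is assumed to be a percentage.
    if line.startswith('F1'):
        tokens = line.split(' ')
        return 'F1', round(float(tokens[2].replace('%', '')) / 100, ndigits=3)
    return None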

@@ -1,17 +1,17 @@
 from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
-mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
-mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv'
+mapping_lid = 'ltable_id'  # name of the left-table id column in the mapping table
+mapping_rid = 'rtable_id'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # id column name of the left table
 rtable_id = 'id'  # id column name of the right table
 target_attr = 'id'  # target column for md mining
 # lr_attrs_map = {}  # if corresponding columns in the two tables have different names, add them here so they can be aligned
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
-interpre_weight = 0.9  # interpretability weight
+interpre_weight = 1  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.25

@@ -1,7 +1,13 @@
+import pandas as pd
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
 if __name__ == '__main__':
-    model = SentenceTransformer('E:\\Data\\Research\\Models\\tapex-base-finetuned-wikisql')
-    sentences = ['公积金转入深圳', '公积金转出深圳']
-    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
+    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
+    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
+    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
+    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
+    encoding = tokenizer(table, return_tensors="pt")
+    embedding = tokenizer.encode(table, return_tensors="pt")
+    print(1)
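The added lines pass the raw DataFrame straight to the TAPAS tokenizer. For comparison, a minimal known-good sketch of the Hugging Face table-QA call, in which the tokenizer takes table= plus one or more queries= and expects every cell to be a string; the query text and the head(50) cut are placeholders of ours (DBLP-ACM tableA is far larger than TAPAS's 512-token input limit), only the model and dataset paths are taken from the diff:

import pandas as pd
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

model_dir = r'E:\Data\Research\Models\tapas-large-finetuned-wtq'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTableQuestionAnswering.from_pretrained(model_dir)

# TAPAS requires all cell values to be strings; keep the example small so it fits the input length.
table = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv',
                    low_memory=False, encoding='ISO-8859-1').astype(str).head(50)
queries = ["Which venue appears most often?"]  # placeholder question, not from the repository
inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)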
