Fix the MD-mining thresholds and mine MDs from the training set

Use GPU-based exhaustive enumeration followed by sampling for MD mining
MD-metrics-HPO
HuangJintao 1 year ago
parent cf278b91db
commit d9bd56ea95
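In outline, the pipeline after this change: the similarity, support and confidence thresholds become fixed constants in settings.py instead of SMAC hyperparameters, the single-tuple file is built from the gold matches of the training split, and MDs are mined on the GPU by discover(). A minimal sketch of how the pieces fit together, assuming the module layout shown in the diffs below:

    from settings import target_attr, er_output_dir
    from md_discovery.discovery_executor_gpu import discover

    # t_single_tuple.csv is written by prepare_file_for_md_discovery() from the gold matches
    mds = discover(er_output_dir + 't_single_tuple.csv', target_attr)
    # each entry: ({column: similarity threshold, or -1 for "no constraint"}, support, confidence)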

@@ -1,55 +0,0 @@
import os
import pyecharts
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    dir_path = r'E:\Data\Research\Outcome\Walmart-Amazon_dirty'
    filename_list = os.listdir(dir_path)
    iter_list = []
    precision = []
    recall = []
    f1 = []
    interpretability = []
    performance = []
    for _ in filename_list:
        if _.startswith('eval_result'):
            it = int(_[12:13])
            iter_list.append(str(it))
            with open(dir_path + '\\' + _, 'r') as f:
                # parse each line and collect the metric values reported in this file
                for line in f.readlines():
                    if line.startswith('Precision'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        precision.append(value)
                    elif line.startswith('Recall'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        recall.append(value)
                    elif line.startswith('F1'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        f1.append(value)
                    elif line.startswith('interpretability'):
                        lt = line.split(':')
                        value = float(lt[1])
                        interpretability.append(value)
                    elif line.startswith('performance'):
                        lt = line.split(':')
                        value = float(lt[1])
                        performance.append(value)
    line = (
        Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
        .add_xaxis(iter_list)
        .add_yaxis('Precision', precision)
        .add_yaxis('Recall', recall)
        .add_yaxis('F1', f1)
        .add_yaxis('Interpretability', interpretability)
        .add_yaxis('Performance', performance)
        .set_global_opts(title_opts=opts.TitleOpts(title=dir_path.split('\\')[-1]))
    )
    line.render(dir_path + '\\' + "line.html")

@@ -0,0 +1,54 @@
import os
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    for _ in datasets_list:
        path = outcome_dir + rf'\{_}' + configs_dir
        statistics_files = os.listdir(path)
        length = 0
        for file in statistics_files:
            if file.startswith('predictions'):
                preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
                preds = preds[['predicted', 'confidence']]
                preds = preds.astype(float)
                preds = preds[preds['predicted'] == 1.0]
                length = len(preds)
        li = []
        zeros = len(preds[preds['confidence'] == 0])
        dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
        dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
        dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
        dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
        dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
        for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
            li.append(round(number * 100 / length, ndigits=3))
        c = (
            Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
            .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
            .add_yaxis(_, li, category_gap=2)
            .set_global_opts(
                yaxis_opts=opts.AxisOpts(
                    name="Proportion",
                    type_="value",
                    min_=0,
                    max_=100,
                    position="left",
                    axisline_opts=opts.AxisLineOpts(
                        linestyle_opts=opts.LineStyleOpts()
                    ),
                    axislabel_opts=opts.LabelOpts(formatter="{value}%"),
                ),
                title_opts=opts.TitleOpts(title="Confidence Histogram"),
                xaxis_opts=opts.AxisOpts(name="Intervals")
            )
            .render(path + r"\confidence_histogram.html")
        )

@@ -0,0 +1,44 @@
import os
import pyecharts
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    f1 = []
    interpretability = []
    for _ in datasets_list:
        path = outcome_dir + rf'\{_}' + configs_dir
        statistics_files = os.listdir(path)
        for i in statistics_files:
            if i.startswith('eval_result'):
                with open(path + rf'\{i}', 'r') as f:
                    # parse each line and collect the F1 and interpretability values
                    for line in f.readlines():
                        if line.startswith('F1'):
                            lt = line.split(' ')
                            value = float(lt[2].replace('%', ''))/100
                            f1.append(round(value, ndigits=3))
                        elif line.startswith('interpretability'):
                            lt = line.split(':')
                            value = float(lt[1])
                            interpretability.append(round(value, ndigits=3))
    c = (
        Bar()
        .add_xaxis(
            datasets_list
        )
        .add_yaxis('F1', f1)
        .add_yaxis('Interpretability', interpretability)
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
            title_opts=opts.TitleOpts(title="F1 and proportion of explainable predictions per dataset", subtitle="interpretability weight 0.5"),
        )
        .render("output/F1_Inter_bars.html")
    )

@@ -12,7 +12,6 @@ from ml_er.ml_entity_resolver import er_process
 class Classifier:
     @property
     def configspace(self) -> ConfigurationSpace:
-        # Build Configuration Space which defines all parameters and their ranges
         cs = ConfigurationSpace(seed=0)
         ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
         selected_attrs = ltable.columns.values.tolist()
@@ -22,12 +21,8 @@ class Classifier:
         block_attr = Categorical("block_attr", block_attr_items)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
-        support_thresh = Integer("support_thresh", (1, 5), default=1)
-        confidence_thresh = Float("confidence_thresh", (0.25, 0.5), default=0.25)
-        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
-                                support_thresh, confidence_thresh])
+        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker])
         return cs

     def train(self, config: Configuration, seed: int = 0) -> float:
@@ -47,8 +42,7 @@ def ml_er_hpo():
     scenario = Scenario(
         cs,
         deterministic=True,
-        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
-        walltime_limit=28800,  # Max time limit in seconds (14400s = 4h)
+        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
         n_workers=1
     )
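For context, the Scenario above is consumed by SMAC's hyperparameter-optimization facade elsewhere in this file; a minimal sketch of that wiring, assuming the SMAC3 (>=2.0) API and the Classifier class defined here:

    from smac import HyperparameterOptimizationFacade, Scenario

    classifier = Classifier()
    scenario = Scenario(classifier.configspace, deterministic=True, n_trials=12, n_workers=1)
    # SMAC minimizes the value returned by Classifier.train for each sampled configuration
    smac = HyperparameterOptimizationFacade(scenario, classifier.train)
    incumbent = smac.optimize()  # best configuration found within the 12-trial budget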

@@ -6,7 +6,7 @@ import copy
 import torch
 from ConfigSpace import Configuration
 from tqdm import tqdm
-from settings import model
+from settings import model, similarity_threshold, support_threshold, confidence_threshold


 def is_minimal(md, md_list, target_col):
@@ -36,11 +36,7 @@ def is_minimal(md, md_list, target_col):
     return minimal


-def pairs_inference(path, target_col, conf: Configuration):
-    simt = conf["similarity_thresh"]
-    # simt = round(simt, ndigits=3)
-    supt = conf["support_thresh"]
-    cont = conf["confidence_thresh"]
+def pairs_inference(path, target_col):
     data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)
     data = data.astype(str)
@@ -109,12 +105,12 @@ def pairs_inference(path, target_col, conf: Configuration):
         for vio_md in violated_mds:
             vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
-            if vio_md_support >= supt:
+            if vio_md_support >= support_threshold:
                 for col in cols_but_target:
                     if sims[col] < 1.0:
                         spec_l_md = copy.deepcopy(vio_md)
-                        if sims[col] < simt:
-                            spec_l_md[col] = simt
+                        if sims[col] < similarity_threshold:
+                            spec_l_md[col] = similarity_threshold
                         else:
                             if sims[col] + 0.01 <= 1.0:
                                 spec_l_md[col] = sims[col] + 0.01
@@ -134,7 +130,7 @@ def pairs_inference(path, target_col, conf: Configuration):
     if len(minimal_vio) > 0:
         for md in minimal_vio[:]:
             support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-            if support >= supt and confidence >= cont:
+            if support >= support_threshold and confidence >= confidence_threshold:
                 minimal_vio.append((md, support, confidence))
                 minimal_vio.remove(md)
@@ -148,7 +144,7 @@ def pairs_inference(path, target_col, conf: Configuration):
     # drop MDs whose support is below the threshold
     for _ in md_list[:]:
         support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
-        if support >= supt and confidence >= cont:
+        if support >= support_threshold and confidence >= confidence_threshold:
             md_list.append((_, support, confidence))
             md_list.remove(_)
     # drop MDs that are not minimal
@@ -170,7 +166,6 @@ def pairs_inference(path, target_col, conf: Configuration):
     result.extend(minimal_vio)
     result.sort(key=operator.itemgetter(2), reverse=True)
     print(f'\033[33mList Length: {len(result)}\033[0m')
-    print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
     return result

@@ -0,0 +1,138 @@
import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
sample_number = 100000
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
    length = data.shape[0]
    width = data.shape[1]
    # md_tensor = list(current_md.values())
    # md_tensor = torch.tensor(md_tensor, device='cuda')
    md_tensor_2d = md_tensor.unsqueeze(1)
    md_tensor_3d = md_tensor_2d.unsqueeze(2)
    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
    sim_tensor = torch.round(sim_tensor, decimals=4)
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(0, width):
        if i != target_index:
            sup_tensor_slice = sup_tensor[i]
            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
    sup_tensor_int = ini_slice.int()
    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
    support_Naumann = (support_Naumann - length) / 2
    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
    conf_tensor_int = conf_tensor.int()
    support_Fan = torch.count_nonzero(conf_tensor_int).item()
    support_Fan = (support_Fan - length) / 2
    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
    return support_Fan, confidence
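The counting above can be read as follows: a tuple pair satisfies the MD's left-hand side when every non-target column similarity is at least the corresponding threshold; the diagonal (each tuple compared with itself) is subtracted and the symmetric count is halved. A toy example with made-up numbers, reusing the same formulas:

    import torch

    # similarity matrix of one non-target column over 3 tuples (symmetric, ones on the diagonal)
    sim = torch.tensor([[1.0, 0.9, 0.3],
                        [0.9, 1.0, 0.4],
                        [0.3, 0.4, 1.0]])
    lhs_threshold = 0.8
    satisfied = torch.ge(sim, lhs_threshold)     # cells meeting the LHS threshold
    raw = torch.count_nonzero(satisfied).item()  # 5 = 3 diagonal cells + the (0,1)/(1,0) pair
    support = (raw - sim.shape[0]) / 2           # 1 distinct pair supports the LHS
    # confidence = (pairs satisfying the LHS and the target threshold) / (pairs satisfying the LHS)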
def build_cartesian(width, target_index):
    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
                                   num=math.ceil((1 - similarity_threshold) / step_length) + 1)
    all_values_array = np.round(all_values_array, 4)
    all_values_tensor = torch.tensor(all_values_array, device='cuda')
    all_values_tensor = all_values_tensor.float()
    all_values_tensor = torch.round(all_values_tensor, decimals=4)
    tensors_for_cartesian = []
    for i in range(0, width):
        if i == target_index:
            t = torch.tensor([1.0], device='cuda')
            tensors_for_cartesian.append(t)
        else:
            tensors_for_cartesian.append(all_values_tensor)
    result = torch.cartesian_prod(*tensors_for_cartesian)
    return result
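The size of this grid grows exponentially with the number of non-target columns, which is why discover() below only attempts the full product for narrow tables. A rough count (similarity_threshold = 0.1 is assumed here, matching the value set in settings.py by this commit):

    import math

    similarity_threshold = 0.1  # assumed, see settings.py
    step_length = 0.01
    values_per_column = math.ceil((1 - similarity_threshold) / step_length) + 1  # 91 candidate thresholds
    non_target_columns = 4                                                        # e.g. a 5-column table
    candidate_mds = values_per_column ** non_target_columns                       # 91 ** 4 = 68,574,961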
def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
    length = data.shape[0]
    width = data.shape[1]
    cartesian_product = cartesian_product.unsqueeze(2)
    cartesian_product = cartesian_product.unsqueeze(3)
    cartesian_product = cartesian_product.repeat(1, 1, length, length)
def discover(path, target_col):
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]
    # indices of all columns except the target column
    columns_indices = [_ for _ in range(0, width) if _ != target_index]
    sentences = []
    for col in range(0, width):
        for row in range(0, length):
            cell_value = data.values[row, col]
            sentences.append(cell_value)
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    sim_tensor = sim_tensor.float()
    sim_tensor = torch.round(sim_tensor, decimals=4)
    # with at most 6 columns a full Cartesian product is feasible; with more columns it may blow up exponentially
    if width <= 6:
        # enumerate all candidate values of every column except the target and take their Cartesian product:
        # the result covers all possible MD threshold vectors
        cartesian = build_cartesian(width, target_index)
        # sample sample_number / (width - 1) MDs (none containing -1)
        if cartesian.shape[0] > sample_number / (width - 1):
            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
            cartesian = torch.index_select(cartesian, 0, index)
    else:
        # randomly generate sample_number / (width - 1) MDs: draw integers with randint, then divide to get decimals (none containing -1)
        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
        cartesian = cartesian / 100
        # build a column of similarity 1 for the target and insert it at the target column's position
        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
        cartesian = torch.round(cartesian, decimals=4)
    # this tensor will be concatenated with copies in which some thresholds are set to -1
    joint_md_tensor = cartesian.clone()
    # randomly set 1 column, 2 columns, ... to -1
    for i in range(width - 2):
        index_list_format = []
        for j in range(cartesian.shape[0]):
            # for every MD, randomly pick the column indices to set to -1
            index_list_format.append(random.sample(columns_indices, i + 1))
        index = torch.tensor(index_list_format, device='cuda')
        # the MD set after the chosen columns have been set to -1
        modified_cartesian = cartesian.scatter(1, index, -1)
        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
    md_list = []
    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
    for _ in tqdm(range(joint_md_tensor.shape[0])):
        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
        if s >= support_threshold and c >= confidence_threshold:
            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
            md_dict_format = {}
            for k in range(0, width):
                md_dict_format[columns[k]] = md_list_format[k]
            md_list.append((md_dict_format, s, c))
    md_list.sort(key=operator.itemgetter(2), reverse=True)
    return md_list
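In the returned list, a threshold of -1 marks a column the MD places no constraint on (the wildcard introduced by the scatter step above). An illustrative entry, with made-up column names and values:

    # one discovered MD over a bibliographic table, as returned by discover()
    ({'title': 0.82, 'authors': -1, 'venue': 0.65, 'id': 1.0}, 12.0, 0.86)
    # read as: if title similarity >= 0.82 and venue similarity >= 0.65 (authors unconstrained),
    # then the two tuples should agree on id, with support 12 and confidence 0.86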

@@ -1,6 +1,7 @@
 from ConfigSpace import Configuration
 from md_discovery.discovery_executor import pairs_inference
+from md_discovery.discovery_executor_gpu import discover
 from settings import *

 # # if support and confidence should not be written out, use the following two code blocks
@@ -17,16 +18,12 @@ from settings import *
 #     f.write(str(_) + '\n')


-def md_discover(config: Configuration):
-    t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
+def md_discover(config: Configuration, source_path, target_path):
     # input: csv file path, similarity thresholds for the MD left-hand side, target attribute on the MD right-hand side
     # output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but meet the confidence threshold
-    mds_list = pairs_inference(t_single_tuple_path, target_attr, config)
-    # write list 1 to a local path (adjust the path as needed)
-    mds_path = md_output_dir + "mds.txt"
-    with open(mds_path, 'w') as f:
+    # mds_list = pairs_inference(source_path, target_attr)
+    mds_list = discover(source_path, target_attr)
+    with open(target_path, 'w') as f:
         for _ in mds_list:
             f.write('Target:'+str(target_attr) + '\t')
             f.write(str(_))

@@ -1,6 +1,7 @@
 import json
 import os
 import sys
+import time

 import ConfigSpace
 import pandas
@@ -12,28 +13,24 @@ import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 import six
 from ConfigSpace import Configuration
+from tqdm import tqdm

 from md_discovery.md_discover import md_discover
 from settings import *


-def process_prediction_for_md_discovery(pred: pd.DataFrame,
-                                        t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"):
-    # extract the true-positive and false-negative parts of the prediction table
-    tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
-    fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
-    # concatenate them into one table
-    df = pd.concat([tp, fn])
-    # make the left and right IDs in the TP/FN table identical
+def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
+    df = train[train['gold'] == 1]
+    # make the left and right IDs of each tuple pair identical
     for index, row in df.iterrows():
         df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
-    pred_columns = pred.columns.values.tolist()
+    train_columns = train.columns.values.tolist()
     l_columns = []
     r_columns = []
     cols = []
-    # collect the left-table and right-table column names of the prediction table into two separate lists
-    for _ in pred_columns:
+    # collect the left-table and right-table column names into two separate lists
+    for _ in train_columns:
         if _.startswith('ltable'):
             l_columns.append(_)
         elif _.startswith('rtable'):
@@ -47,6 +44,7 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
     ldf.columns = cols
     rdf.columns = cols
     t_single_tuple = pd.concat([ldf, rdf])
+    t_single_tuple = t_single_tuple.reset_index(drop=True)

     t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
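A small illustration of what prepare_file_for_md_discovery writes (column names and values are made up; real tables use the dataset's own attributes):

    import pandas as pd

    train = pd.DataFrame({
        'ltable_id': [3], 'ltable_name': ['iphone 13'],
        'rtable_id': [97], 'rtable_name': ['apple iphone 13'],
        'gold': [1],
    })
    # only gold matches are kept; rtable_id is overwritten with ltable_id so both halves share a key,
    # then the ltable_* and rtable_* halves are renamed to common column names and stacked:
    #    id   name
    # 0   3   iphone 13
    # 1   3   apple iphone 13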
@@ -148,6 +146,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):

 def er_process(config: Configuration):
+    start = time.time()
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
     # ltable.fillna("", inplace=True)
@@ -295,27 +294,42 @@ def er_process(config: Configuration):
     predictions_attrs.extend(attrs_with_r_prefix)
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
-    process_prediction_for_md_discovery(predictions)
+    train_attrs = predictions_attrs[:]
+    train_attrs.remove('predicted')
+    train_set = train_set[train_attrs]
+    prepare_file_for_md_discovery(train_set)
     predictions = predictions.reset_index(drop=True)
     predictions = predictions.astype(str)
     sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)

     predictions['confidence'] = 0

-    md_discover(config)
+    md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
     md_paths = [md_output_dir + 'mds.txt']
     md_list = load_mds(md_paths)  # load all MDs from the given paths
     epl_match = 0  # number of explainable predicted matches
+    unexplainable = pd.DataFrame()
     if len(md_list) > 0:
-        for row in predictions.itertuples():
+        for row in tqdm(predictions.itertuples()):
             x = is_explicable(row, md_list, sim_tensor_dict)
             if x > 0 and str(getattr(row, 'predicted')) == str(1):
                 predictions.loc[row[0], 'confidence'] = x
                 epl_match += 1
+            # else:
+            #     series = pd.Series(row)
+            #     unexplainable = unexplainable._append(series, ignore_index=True)
+    # unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
+    # unexplainable.columns = predictions_attrs
+    # unexplainable = unexplainable[train_attrs]
+    # if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
+    #     prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
+    #     md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")

     df = predictions[predictions['predicted'] == str(1)]
     interpretability = epl_match / len(df)  # interpretability: share of explainable predicted matches
     indicators['interpretability'] = interpretability
-    if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]):
+    if indicators["block_recall"] < indicators["recall"]:
         f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
                 indicators["precision"] + indicators["block_recall"])
     else:
@@ -326,6 +340,7 @@ def er_process(config: Configuration):
     print(indicators)
     predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
     ################################################################################################################
+    print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
     return indicators
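For concreteness, the two quantities computed above combine roughly as follows (a worked example with made-up counts; the exact way interpretability and F1 enter the HPO objective is not part of this diff, so the weighted sum at the end is only an assumption based on interpre_weight in settings.py):

    # suppose 200 pairs are predicted as matches and 150 of them are covered by at least one MD
    interpretability = 150 / 200                                      # 0.75
    precision, recall, block_recall = 0.90, 0.80, 0.70
    # when the blocker's recall is lower than the matcher's recall, F1 uses block_recall instead
    f1 = 2.0 * precision * block_recall / (precision + block_recall)  # 0.7875
    interpre_weight = 0.5
    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1  # assumed objective, ~0.769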

@@ -1,10 +1,10 @@
 from sentence_transformers import SentenceTransformer

-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Amazon.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\GoogleProducts.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Mapping.csv'
-mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
-mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\matches.csv'
+mapping_lid = 'idDBLP'  # name of the left-table id column in the mapping table
+mapping_rid = 'idScholar'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # name of the left-table id column
 rtable_id = 'id'  # name of the right-table id column
 target_attr = 'id'  # target attribute for MD discovery
@@ -12,9 +12,9 @@ target_attr = 'id'  # target attribute for MD discovery
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
 interpre_weight = 0.5  # weight of interpretability in the objective
-# similarity_threshold = 0.2
-# support_threshold = 100
-# confidence_threshold = 0.4
+similarity_threshold = 0.1
+support_threshold = 1
+confidence_threshold = 0.25

 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'

@@ -147,3 +147,13 @@ def test12():
         dic = json.load(f)
     for _ in dic.keys():
         print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
+
+
+def test13():
+    outcome_dir = r'E:\Data\Research\Outcome'
+    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
+    datasets_list = os.listdir(outcome_dir)
+    f = []
+    for _ in datasets_list:
+        f.append(outcome_dir + rf'\{_}' + configs_dir)
+    print(f)
