Fix the MD-mining thresholds and mine MDs from the training set

Use GPU-based exhaustive-then-sampling mining
MD-metrics-HPO
HuangJintao 1 year ago
parent cf278b91db
commit d9bd56ea95

@ -1,55 +0,0 @@
import os
import pyecharts
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    dir_path = r'E:\Data\Research\Outcome\Walmart-Amazon_dirty'
    filename_list = os.listdir(dir_path)
    iter_list = []
    precision = []
    recall = []
    f1 = []
    interpretability = []
    performance = []
    for _ in filename_list:
        if _.startswith('eval_result'):
            it = int(_[12:13])
            iter_list.append(str(it))
            with open(dir_path + '\\' + _, 'r') as f:
                # parse the metric values from each line of this eval_result file
                for line in f.readlines():
                    if line.startswith('Precision'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        precision.append(value)
                    elif line.startswith('Recall'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        recall.append(value)
                    elif line.startswith('F1'):
                        lt = line.split(' ')
                        value = float(lt[2].replace('%', ''))/100
                        f1.append(value)
                    elif line.startswith('interpretability'):
                        lt = line.split(':')
                        value = float(lt[1])
                        interpretability.append(value)
                    elif line.startswith('performance'):
                        lt = line.split(':')
                        value = float(lt[1])
                        performance.append(value)
    line = (
        Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
        .add_xaxis(iter_list)
        .add_yaxis('Precision', precision)
        .add_yaxis('Recall', recall)
        .add_yaxis('F1', f1)
        .add_yaxis('Interpretability', interpretability)
        .add_yaxis('Performance', performance)
        .set_global_opts(title_opts=opts.TitleOpts(title=dir_path.split('\\')[-1]))
    )
    line.render(dir_path + '\\' + "line.html")

@ -0,0 +1,54 @@
import os
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    for _ in datasets_list:
        path = outcome_dir + rf'\{_}' + configs_dir
        statistics_files = os.listdir(path)
        length = 0
        for file in statistics_files:
            if file.startswith('predictions'):
                preds = pd.read_csv(path + rf'\{file}', encoding='ISO-8859-1')
                preds = preds[['predicted', 'confidence']]
                preds = preds.astype(float)
                preds = preds[preds['predicted'] == 1.0]
                length = len(preds)
        li = []
        zeros = len(preds[preds['confidence'] == 0])
        dot_02 = len(preds[(preds['confidence'] > 0) & (preds['confidence'] <= 0.2)])
        dot_24 = len(preds[(preds['confidence'] > 0.2) & (preds['confidence'] <= 0.4)])
        dot_46 = len(preds[(preds['confidence'] > 0.4) & (preds['confidence'] <= 0.6)])
        dot_68 = len(preds[(preds['confidence'] > 0.6) & (preds['confidence'] <= 0.8)])
        dot_80 = len(preds[(preds['confidence'] > 0.8) & (preds['confidence'] <= 1.0)])
        for number in [zeros, dot_02, dot_24, dot_46, dot_68, dot_80]:
            li.append(round(number * 100 / length, ndigits=3))
        c = (
            Bar(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
            .add_xaxis(['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1'])
            .add_yaxis(_, li, category_gap=2)
            .set_global_opts(
                yaxis_opts=opts.AxisOpts(
                    name="Proportion",
                    type_="value",
                    min_=0,
                    max_=100,
                    position="left",
                    axisline_opts=opts.AxisLineOpts(
                        linestyle_opts=opts.LineStyleOpts()
                    ),
                    axislabel_opts=opts.LabelOpts(formatter="{value}%"),
                ),
                title_opts=opts.TitleOpts(title="Confidence Histogram"),
                xaxis_opts=opts.AxisOpts(name="Intervals")
            )
            .render(path + r"\confidence_histogram.html")
        )
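
For reference, the manual binning above could also be written with pandas.cut; the sketch below is illustrative only (not part of the commit) and assumes the same predictions CSV layout with 'predicted' and 'confidence' columns.

import pandas as pd

def confidence_proportions(pred_csv_path):
    # Hypothetical helper mirroring the binning logic above with pandas.cut.
    preds = pd.read_csv(pred_csv_path, encoding='ISO-8859-1')
    preds = preds[['predicted', 'confidence']].astype(float)
    preds = preds[preds['predicted'] == 1.0]
    bins = [-0.001, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    labels = ['conf=0', '0<conf≤0.2', '0.2<conf≤0.4', '0.4<conf≤0.6', '0.6<conf≤0.8', '0.8<conf≤1']
    counts = pd.cut(preds['confidence'], bins=bins, labels=labels).value_counts().reindex(labels)
    return (counts * 100 / len(preds)).round(3).tolist()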

@ -0,0 +1,44 @@
import os
import pyecharts
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
if __name__ == '__main__':
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    f1 = []
    interpretability = []
    for _ in datasets_list:
        path = outcome_dir + rf'\{_}' + configs_dir
        statistics_files = os.listdir(path)
        for i in statistics_files:
            if i.startswith('eval_result'):
                with open(path + rf'\{i}', 'r') as f:
                    # parse the F1 and interpretability values from each line of the eval_result file
                    for line in f.readlines():
                        if line.startswith('F1'):
                            lt = line.split(' ')
                            value = float(lt[2].replace('%', ''))/100
                            f1.append(round(value, ndigits=3))
                        elif line.startswith('interpretability'):
                            lt = line.split(':')
                            value = float(lt[1])
                            interpretability.append(round(value, ndigits=3))
    c = (
        Bar()
        .add_xaxis(
            datasets_list
        )
        .add_yaxis('F1', f1)
        .add_yaxis('Interpretability', interpretability)
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
            title_opts=opts.TitleOpts(title="F1 and explainable-prediction ratio per dataset", subtitle="interpretability weight 0.5"),
        )
        .render("output/F1_Inter_bars.html")
    )

@ -12,7 +12,6 @@ from ml_er.ml_entity_resolver import er_process
class Classifier:
@property
def configspace(self) -> ConfigurationSpace:
# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace(seed=0)
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
selected_attrs = ltable.columns.values.tolist()
@ -22,12 +21,8 @@ class Classifier:
block_attr = Categorical("block_attr", block_attr_items)
ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
support_thresh = Integer("support_thresh", (1, 5), default=1)
confidence_thresh = Float("confidence_thresh", (0.25, 0.5), default=0.25)
cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
support_thresh, confidence_thresh])
cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker])
return cs
def train(self, config: Configuration, seed: int = 0) -> float:
@ -47,8 +42,7 @@ def ml_er_hpo():
scenario = Scenario(
cs,
deterministic=True,
n_trials=50, # We want to run max 50 trials (combination of config and seed)
walltime_limit=28800, # Max time limit in seconds (28800 s = 8 h)
n_trials=12, # We want to run max 12 trials (combination of config and seed)
n_workers=1
)
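
For context, a minimal sketch of how the reduced search space and the shorter SMAC run fit together; attr_list and er_objective are hypothetical stand-ins for the blocking-attribute list and the Classifier.train target function, and the SMAC calls follow the same Scenario usage shown above.

from ConfigSpace import Categorical, ConfigurationSpace
from smac import HyperparameterOptimizationFacade, Scenario

def build_space(attr_list):
    # Only the ER-related choices remain; the MD thresholds are now fixed in settings.py.
    cs = ConfigurationSpace(seed=0)
    cs.add_hyperparameters([
        Categorical("block_attr", attr_list),
        Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf"),
        Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap"),
    ])
    return cs

def run_hpo(attr_list, er_objective):
    scenario = Scenario(build_space(attr_list), deterministic=True, n_trials=12, n_workers=1)
    return HyperparameterOptimizationFacade(scenario, er_objective).optimize()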

@ -6,7 +6,7 @@ import copy
import torch
from ConfigSpace import Configuration
from tqdm import tqdm
from settings import model
from settings import model, similarity_threshold, support_threshold, confidence_threshold
def is_minimal(md, md_list, target_col):
@ -36,11 +36,7 @@ def is_minimal(md, md_list, target_col):
return minimal
def pairs_inference(path, target_col, conf: Configuration):
simt = conf["similarity_thresh"]
# simt = round(simt, ndigits=3)
supt = conf["support_thresh"]
cont = conf["confidence_thresh"]
def pairs_inference(path, target_col):
data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
data.fillna("", inplace=True)
data = data.astype(str)
@ -109,12 +105,12 @@ def pairs_inference(path, target_col, conf: Configuration):
for vio_md in violated_mds:
vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
if vio_md_support >= supt:
if vio_md_support >= support_threshold:
for col in cols_but_target:
if sims[col] < 1.0:
spec_l_md = copy.deepcopy(vio_md)
if sims[col] < simt:
spec_l_md[col] = simt
if sims[col] < similarity_threshold:
spec_l_md[col] = similarity_threshold
else:
if sims[col] + 0.01 <= 1.0:
spec_l_md[col] = sims[col] + 0.01
@ -134,7 +130,7 @@ def pairs_inference(path, target_col, conf: Configuration):
if len(minimal_vio) > 0:
for md in minimal_vio[:]:
support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
if support >= supt and confidence >= cont:
if support >= support_threshold and confidence >= confidence_threshold:
minimal_vio.append((md, support, confidence))
minimal_vio.remove(md)
@ -148,7 +144,7 @@ def pairs_inference(path, target_col, conf: Configuration):
# Drop MDs whose support or confidence is below the thresholds
for _ in md_list[:]:
support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
if support >= supt and confidence >= cont:
if support >= support_threshold and confidence >= confidence_threshold:
md_list.append((_, support, confidence))
md_list.remove(_)
# Remove non-minimal MDs
@ -170,7 +166,6 @@ def pairs_inference(path, target_col, conf: Configuration):
result.extend(minimal_vio)
result.sort(key=operator.itemgetter(2), reverse=True)
print(f'\033[33mList Length: {len(result)}\033[0m')
print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
return result
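
To make the specialization step above easier to follow, here is a small standalone sketch of the rule it applies, assuming an MD is a dict of column → similarity bound; the fixed similarity_threshold comes from settings.py, and the handling of bounds that cannot be raised further is elided in the hunk, so the sketch simply skips that case.

import copy

similarity_threshold = 0.1  # mirrors the fixed value in settings.py

def specialize(vio_md, sims, cols_but_target):
    # Tighten the left-hand side of a violated MD: raise each non-target bound
    # just above the observed similarity so the violating pair no longer matches.
    specialized = []
    for col in cols_but_target:
        if sims[col] < 1.0:
            spec_l_md = copy.deepcopy(vio_md)
            if sims[col] < similarity_threshold:
                spec_l_md[col] = similarity_threshold
            elif sims[col] + 0.01 <= 1.0:
                spec_l_md[col] = sims[col] + 0.01
            else:
                continue
            specialized.append(spec_l_md)
    return specialized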

@ -0,0 +1,138 @@
import math
import operator
import random
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from settings import model, similarity_threshold, support_threshold, confidence_threshold, md_output_dir
sample_number = 100000
step_length = 0.01
def get_metrics(md_tensor, data, sim_tensor, target_index):
    length = data.shape[0]
    width = data.shape[1]
    # md_tensor = list(current_md.values())
    # md_tensor = torch.tensor(md_tensor, device='cuda')
    md_tensor_2d = md_tensor.unsqueeze(1)
    md_tensor_3d = md_tensor_2d.unsqueeze(2)
    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
    sim_tensor = torch.round(sim_tensor, decimals=4)
    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
    for i in range(0, width):
        if i != target_index:
            sup_tensor_slice = sup_tensor[i]
            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
    sup_tensor_int = ini_slice.int()
    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
    support_Naumann = (support_Naumann - length) / 2
    conf_tensor = torch.logical_and(ini_slice, sup_tensor[target_index])
    conf_tensor_int = conf_tensor.int()
    support_Fan = torch.count_nonzero(conf_tensor_int).item()
    support_Fan = (support_Fan - length) / 2
    confidence = support_Fan / support_Naumann if support_Naumann > 0 else 0
    return support_Fan, confidence

def build_cartesian(width, target_index):
    all_values_array = np.linspace(start=similarity_threshold, stop=1, endpoint=True,
                                   num=math.ceil((1-similarity_threshold)/step_length) + 1)
    all_values_array = np.round(all_values_array, 4)
    all_values_tensor = torch.tensor(all_values_array, device='cuda')
    all_values_tensor = all_values_tensor.float()
    all_values_tensor = torch.round(all_values_tensor, decimals=4)
    tensors_for_cartesian = []
    for i in range(0, width):
        if i == target_index:
            t = torch.tensor([1.0], device='cuda')
            tensors_for_cartesian.append(t)
        else:
            tensors_for_cartesian.append(all_values_tensor)
    result = torch.cartesian_prod(*tensors_for_cartesian)
    return result

def get_metric_tensor(cartesian_product, data, sim_tensor, target_index):
    # Batched variant (incomplete): it only reshapes the candidate tensor so far and is
    # not called anywhere (see the commented-out call in discover below).
    length = data.shape[0]
    width = data.shape[1]
    cartesian_product = cartesian_product.unsqueeze(2)
    cartesian_product = cartesian_product.unsqueeze(3)
    cartesian_product = cartesian_product.repeat(1, 1, length, length)

def discover(path, target_col):
    data = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
    data.fillna("", inplace=True)
    data = data.astype(str)
    columns = data.columns.values.tolist()
    target_index = columns.index(target_col)
    cols_but_target = list(set(columns) - {target_col})
    length = data.shape[0]
    width = data.shape[1]
    # indices of all columns except the target column
    columns_indices = [_ for _ in range(0, width) if _ != target_index]
    sentences = []
    for col in range(0, width):
        for row in range(0, length):
            cell_value = data.values[row, col]
            sentences.append(cell_value)
    embedding = model.encode(sentences, convert_to_tensor=True, device="cuda")
    split_embedding = torch.split(embedding, length, dim=0)
    table_tensor = torch.stack(split_embedding, dim=0, out=None)
    norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
    sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
    sim_tensor = sim_tensor.float()
    sim_tensor = torch.round(sim_tensor, decimals=4)
    # With at most 6 columns a full Cartesian product is feasible; with more columns it may explode exponentially
    if width <= 6:
        # Enumerate every value combination of the non-target columns (Cartesian product) = all possible MD candidates
        cartesian = build_cartesian(width, target_index)
        # Sample sample_number / (width - 1) MDs (no -1 entries)
        if cartesian.shape[0] > sample_number / (width - 1):
            index = torch.tensor(random.sample(range(cartesian.shape[0]), math.ceil(sample_number / (width - 1))), device='cuda')
            cartesian = torch.index_select(cartesian, 0, index)
    else:
        # Randomly generate sample_number / (width - 1) MDs: draw integers with randint, then divide to obtain decimals (no -1 entries)
        cartesian = torch.randint(int(similarity_threshold * 100), 1 * 100,
                                  (math.ceil(sample_number / (width - 1)), width - 1), device='cuda')
        cartesian = cartesian / 100
        # Build a column of similarity 1 for the target attribute and insert it at the target column's position
        ones = torch.ones((math.ceil(sample_number / (width - 1)), 1), device='cuda')
        cartesian = torch.cat((cartesian[:, 0:target_index], ones, cartesian[:, target_index:]), 1)
    cartesian = torch.round(cartesian, decimals=4)
    # This tensor will be concatenated with copies whose columns are set to -1
    joint_md_tensor = cartesian.clone()
    # Randomly set 1, 2, ... columns to -1
    for i in range(width - 2):
        index_list_format = []
        for j in range(cartesian.shape[0]):
            # For each MD, randomly pick the column indices that will be set to -1
            index_list_format.append(random.sample(columns_indices, i + 1))
        index = torch.tensor(index_list_format, device='cuda')
        # Candidate MDs after the chosen columns have been set to -1
        modified_cartesian = cartesian.scatter(1, index, -1)
        joint_md_tensor = torch.cat((joint_md_tensor, modified_cartesian), 0)
    md_list = []
    # get_metric_tensor(cartesian, data, sim_tensor, target_index)
    for _ in tqdm(range(joint_md_tensor.shape[0])):
        s, c = get_metrics(joint_md_tensor[_], data, sim_tensor, target_index)
        if s >= support_threshold and c >= confidence_threshold:
            md_list_format = [round(i, 4) for i in joint_md_tensor[_].tolist()]
            md_dict_format = {}
            for k in range(0, width):
                md_dict_format[columns[k]] = md_list_format[k]
            md_list.append((md_dict_format, s, c))
    md_list.sort(key=operator.itemgetter(2), reverse=True)
    return md_list
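
To illustrate what get_metrics computes, the toy example below broadcasts one MD bound vector against a hand-written pair-similarity tensor for a 3-row, 2-column table (device changed to CPU purely for illustration): pairs satisfying every left-hand-side bound give the Naumann-style support, and the fraction of those that also satisfy the target bound is the confidence.

import torch

# sim_tensor[i, a, b] = similarity of rows a and b on column i; column 1 is the target.
sim_tensor = torch.tensor([
    [[1.0, 0.9, 0.2],
     [0.9, 1.0, 0.3],
     [0.2, 0.3, 1.0]],
    [[1.0, 1.0, 0.1],
     [1.0, 1.0, 0.0],
     [0.1, 0.0, 1.0]],
])
md = torch.tensor([0.8, 1.0])  # bound 0.8 on column 0, bound 1.0 on the target column
length = 3

lhs = torch.ge(sim_tensor[0], md[0])                        # pairs meeting the LHS bound
support_naumann = (lhs.int().sum().item() - length) / 2     # count each off-diagonal pair once
rhs = torch.logical_and(lhs, torch.ge(sim_tensor[1], md[1]))
support_fan = (rhs.int().sum().item() - length) / 2
confidence = support_fan / support_naumann if support_naumann > 0 else 0
print(support_naumann, support_fan, confidence)             # 1.0 1.0 1.0: the one LHS pair also matches on the target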

@ -1,6 +1,7 @@
from ConfigSpace import Configuration
from md_discovery.discovery_executor import pairs_inference
from md_discovery.discovery_executor_gpu import discover
from settings import *
# # If support and confidence should not be written out, use the two commented-out blocks below
@ -17,16 +18,12 @@ from settings import *
# f.write(str(_) + '\n')
def md_discover(config: Configuration):
t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
def md_discover(config: Configuration, source_path, target_path):
# Input: CSV file path, similarity thresholds for the MD left-hand side, and the target attribute on the MD right-hand side
# Output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence still meets the threshold
mds_list = pairs_inference(t_single_tuple_path, target_attr, config)
# Write list 1 to a local path (adjust the path as needed)
mds_path = md_output_dir + "mds.txt"
with open(mds_path, 'w') as f:
# mds_list = pairs_inference(source_path, target_attr)
mds_list = discover(source_path, target_attr)
with open(target_path, 'w') as f:
for _ in mds_list:
f.write('Target:'+str(target_attr) + '\t')
f.write(str(_))
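
Each line written above is the target attribute followed by the (md_dict, support, confidence) tuple returned by discover(); the snippet below shows the resulting format with invented values, where -1 marks a column the MD places no bound on.

# One mined MD as returned by discover(): per-column similarity bounds plus metrics (toy values).
example_md = ({'title': 0.82, 'brand': 0.5, 'price': -1, 'id': 1.0}, 37, 0.9459)

with open('mds_example.txt', 'w') as f:
    f.write('Target:' + 'id' + '\t')
    f.write(str(example_md))
# Written line: Target:id  ({'title': 0.82, 'brand': 0.5, 'price': -1, 'id': 1.0}, 37, 0.9459)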

@ -1,6 +1,7 @@
import json
import os
import sys
import time
import ConfigSpace
import pandas
@ -12,28 +13,24 @@ import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
import six
from ConfigSpace import Configuration
from tqdm import tqdm
from md_discovery.md_discover import md_discover
from settings import *
def process_prediction_for_md_discovery(pred: pd.DataFrame,
t_single_tuple_path: str = er_output_dir + "t_single_tuple.csv"):
# Extract the true-positive and false-negative rows from the prediction table
tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
# Concatenate them into one table
df = pd.concat([tp, fn])
# Align the left and right IDs in the TP/FN table
def prepare_file_for_md_discovery(train, t_single_tuple_path=er_output_dir + "t_single_tuple.csv"):
df = train[train['gold'] == 1]
# Align the left and right IDs of each tuple pair
for index, row in df.iterrows():
df.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
pred_columns = pred.columns.values.tolist()
train_columns = train.columns.values.tolist()
l_columns = []
r_columns = []
cols = []
# Add the left-table and right-table column names of the prediction table to two separate lists
for _ in pred_columns:
# Add the left-table and right-table column names to two separate lists
for _ in train_columns:
if _.startswith('ltable'):
l_columns.append(_)
elif _.startswith('rtable'):
@ -47,6 +44,7 @@ def process_prediction_for_md_discovery(pred: pd.DataFrame,
ldf.columns = cols
rdf.columns = cols
t_single_tuple = pd.concat([ldf, rdf])
t_single_tuple = t_single_tuple.reset_index(drop=True)
t_single_tuple.to_csv(t_single_tuple_path, sep=',', index=False, header=True, quoting=1)
@ -148,6 +146,7 @@ def build_col_pairs_sim_tensor_dict(predictions: pandas.DataFrame):
def er_process(config: Configuration):
start = time.time()
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
# ltable.fillna("", inplace=True)
@ -295,27 +294,42 @@ def er_process(config: Configuration):
predictions_attrs.extend(attrs_with_r_prefix)
predictions_attrs.extend(['gold', 'predicted'])
predictions = predictions[predictions_attrs]
process_prediction_for_md_discovery(predictions)
train_attrs = predictions_attrs[:]
train_attrs.remove('predicted')
train_set = train_set[train_attrs]
prepare_file_for_md_discovery(train_set)
predictions = predictions.reset_index(drop=True)
predictions = predictions.astype(str)
sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
predictions['confidence'] = 0
md_discover(config)
md_discover(config, er_output_dir + "t_single_tuple.csv", md_output_dir + "mds.txt")
md_paths = [md_output_dir + 'mds.txt']
md_list = load_mds(md_paths) # load all mined MDs from the paths above
epl_match = 0 # number of explainable predicted matches
unexplainable = pd.DataFrame()
if len(md_list) > 0:
for row in predictions.itertuples():
for row in tqdm(predictions.itertuples()):
x = is_explicable(row, md_list, sim_tensor_dict)
if x > 0 and str(getattr(row, 'predicted')) == str(1):
predictions.loc[row[0], 'confidence'] = x
epl_match += 1
# else:
# series = pd.Series(row)
# unexplainable = unexplainable._append(series, ignore_index=True)
# unexplainable.drop(columns=unexplainable.columns[[-1, 0]], inplace=True)
# unexplainable.columns = predictions_attrs
# unexplainable = unexplainable[train_attrs]
# if len(unexplainable[unexplainable['gold'] == str(1)]) > 0:
# prepare_file_for_md_discovery(unexplainable, t_single_tuple_path=er_output_dir + 'unexplainable_tst.csv')
# md_discover(config, er_output_dir + 'unexplainable_tst.csv', md_output_dir + "from_unexplainable.txt")
df = predictions[predictions['predicted'] == str(1)]
interpretability = epl_match / len(df) # interpretability: fraction of predicted matches that are explainable
indicators['interpretability'] = interpretability
if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]):
if indicators["block_recall"] < indicators["recall"]:
f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
indicators["precision"] + indicators["block_recall"])
else:
@ -326,6 +340,7 @@ def er_process(config: Configuration):
print(indicators)
predictions.to_csv(er_output_dir + 'predictions.csv', sep=',', index=False, header=True)
################################################################################################################
print(f'\033[33mTime consumed by ML-ER in seconds: {time.time() - start}\033[0m')
return indicators
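
The indicators computed above feed the HPO objective; below is a sketch of that scoring, assuming the indicators dict uses the keys shown in this hunk and that the final objective weights interpretability against F1 with interpre_weight (the weighted combination itself sits outside this hunk, so treat it as an assumption).

def score(indicators, interpre_weight=0.5):
    # Penalize F1 with block recall when blocking loses more matches than the matcher finds.
    if indicators["block_recall"] < indicators["recall"]:
        f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
                indicators["precision"] + indicators["block_recall"])
    else:
        f1 = indicators["F1"]  # assumed key for the unpenalized F1
    # Assumed weighted objective combining explainability and matching quality.
    return interpre_weight * indicators["interpretability"] + (1 - interpre_weight) * f1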

@ -1,10 +1,10 @@
from sentence_transformers import SentenceTransformer
ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Amazon.csv'
rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\GoogleProducts.csv'
mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Amazon-Google\Mapping.csv'
mapping_lid = 'idAmazon' # id column of the left table in the mapping table
mapping_rid = 'idGoogleBase' # id column of the right table in the mapping table
ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableA.csv'
rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\matches.csv'
mapping_lid = 'idDBLP' # id column of the left table in the mapping table
mapping_rid = 'idScholar' # id column of the right table in the mapping table
ltable_id = 'id' # name of the left table's id column
rtable_id = 'id' # name of the right table's id column
target_attr = 'id' # target attribute for MD mining
@ -12,9 +12,9 @@ target_attr = 'id' # target attribute for MD mining
model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
interpre_weight = 0.5 # weight of interpretability in the objective
# similarity_threshold = 0.2
# support_threshold = 100
# confidence_threshold = 0.4
similarity_threshold = 0.1
support_threshold = 1
confidence_threshold = 0.25
er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'

@ -147,3 +147,13 @@ def test12():
dic = json.load(f)
for _ in dic.keys():
print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
def test13():
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    f = []
    for _ in datasets_list:
        f.append(outcome_dir + rf'\{_}' + configs_dir)
    print(f)
