MD-metrics-HPO
HuangJintao 1 year ago
parent 9a4dfde047
commit 6bdfd9eb51

@@ -1,21 +1,12 @@
-import os
-import numpy as np
-import torch
 import json
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition
 from ConfigSpace.read_and_write import json as csj
-import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
-from md_discovery.md_discover import md_discover
 from settings import *
-from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict, \
-    process_prediction_for_md_discovery, er_process
+from ml_er.ml_entity_resolver import er_process
 class Classifier:
@@ -29,17 +20,14 @@ class Classifier:
         block_attr_items.remove(ltable_id)
         block_attr = Categorical("block_attr", block_attr_items)
-        overlap_size = Integer("overlap_size", (1, 3), default=1)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        similarity_thresh = Float("similarity_thresh", (0.2, 0.21))
-        support_thresh = Integer("support_thresh", (1, 1000))
-        confidence_thresh = Float("confidence_thresh", (0.25, 0.5))
-        use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
-        cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker,
-                                similarity_thresh, support_thresh, confidence_thresh])
-        cs.add_conditions([use_overlap_size])
+        similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
+        support_thresh = Integer("support_thresh", (1, 5), default=1)
+        confidence_thresh = Float("confidence_thresh", (0.3, 0.7), default=0.4)
+        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
+                                support_thresh, confidence_thresh])
         return cs
     # train is the whole ER pipeline; the only change is returning an evaluation of the prediction instead of the prediction itself
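As a quick orientation for readers new to ConfigSpace, here is a minimal, self-contained sketch of how a space like the one defined above behaves once built. The helper is illustrative only: the block_attr choices are placeholders, not columns from this repository.

from ConfigSpace import Categorical, ConfigurationSpace, Float, Integer

# Illustrative stand-in for the reduced search space with the same six hyperparameters.
cs = ConfigurationSpace(seed=0)
cs.add_hyperparameters([
    Categorical("block_attr", ["name", "addr", "phone"]),  # placeholder attribute names
    Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf"),
    Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap"),
    Float("similarity_thresh", (0, 0.2), default=0.2),
    Integer("support_thresh", (1, 5), default=1),
    Float("confidence_thresh", (0.3, 0.7), default=0.4),
])

config = cs.sample_configuration()  # the kind of object SMAC hands to Classifier.train
print(config["ml_matcher"], config["similarity_thresh"])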
@@ -48,183 +36,6 @@ class Classifier:
         indicators = er_process(config)
         return 1-indicators['performance']
-# # the data is loaded outside this function
-# ######################################################################################################################
-# ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-# cm.set_key(ltable, ltable_id)
-# # ltable.fillna("", inplace=True)
-# rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-# cm.set_key(rtable, rtable_id)
-# # rtable.fillna("", inplace=True)
-# mappings = pd.read_csv(mapping_path, encoding='ISO-8859-1')
-#
-# lid_mapping_list = []
-# rid_mapping_list = []
-# # convert everything to strings
-# # ltable = ltable.astype(str)
-# # rtable = rtable.astype(str)
-# # mappings = mappings.astype(str)
-# matching_number = len(mappings)  # total number of positive samples (should be 1300 for the product dataset)
-#
-# for index, row in mappings.iterrows():
-#     lid_mapping_list.append(row[mapping_lid])
-#     rid_mapping_list.append(row[mapping_rid])
-# # keep only the rows of both tables that appear in the mapping table, to raise the proportion of positive samples
-# selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-# # if len(lr_attrs_map) > 0:
-# #     selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # rename left-table columns that correspond to, but are named differently from, right-table columns
-# tables_id = rtable_id  # whether or not the two ID column names agree, after the adjustment above the right table's name is used throughout
-# selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
-# selected_attrs = selected_ltable.columns.values.tolist()  # column names shared by the two tables
-# ######################################################################################################################
-#
-# attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # column names with the left prefix
-# attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # column names with the right prefix
-# cm.set_key(selected_ltable, tables_id)
-# cm.set_key(selected_rtable, tables_id)
-#
-# if config["ml_blocker"] == "over_lap":
-#     blocker = em.OverlapBlocker()
-#     candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-#                                      l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-#                                      overlap_size=config["overlap_size"], show_progress=False,
-#                                      allow_missing=True)
-# elif config["ml_blocker"] == "attr_equiv":
-#     blocker = em.AttrEquivalenceBlocker()
-#     candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-#                                      l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-#                                      allow_missing=True)
-#
-# candidate['gold'] = 0
-# candidate = candidate.reset_index(drop=True)
-# candidate_match_rows = []
-# for line in candidate.itertuples():
-#     l_id = getattr(line, 'ltable_' + tables_id)
-#     map_row = mappings[mappings[mapping_lid] == l_id]
-#
-#     if map_row is not None:
-#         r_id = map_row[mapping_rid]
-#         for value in r_id:
-#             if value == getattr(line, 'rtable_' + tables_id):
-#                 candidate_match_rows.append(line[0])
-#             else:
-#                 continue
-# for _ in candidate_match_rows:
-#     candidate.loc[_, 'gold'] = 1
-#
-# candidate.fillna("", inplace=True)
-#
-# # trim negative samples so positives and negatives stay balanced
-# candidate_mismatch = candidate[candidate['gold'] == 0]
-# candidate_match = candidate[candidate['gold'] == 1]
-# if len(candidate_mismatch) > len(candidate_match):
-#     candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
-# # concatenate positive and negative samples
-# candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
-# if len(candidate_for_train_test) == 0:
-#     return 1
-# candidate_for_train_test = candidate_for_train_test.reset_index(drop=True)
-# cm.set_key(candidate_for_train_test, '_id')
-# cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
-# cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
-# cm.set_ltable(candidate_for_train_test, selected_ltable)
-# cm.set_rtable(candidate_for_train_test, selected_rtable)
-#
-# # split into training and test sets
-# train_proportion = 0.7
-# test_proportion = 0.3
-# sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
-# train_set = sets['train']
-# test_set = sets['test']
-#
-# cm.set_key(train_set, '_id')
-# cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
-# cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
-# cm.set_ltable(train_set, selected_ltable)
-# cm.set_rtable(train_set, selected_rtable)
-#
-# cm.set_key(test_set, '_id')
-# cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
-# cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
-# cm.set_ltable(test_set, selected_ltable)
-# cm.set_rtable(test_set, selected_rtable)
-#
-# if config["ml_matcher"] == "dt":
-#     matcher = em.DTMatcher(name='DecisionTree', random_state=0)
-# elif config["ml_matcher"] == "svm":
-#     matcher = em.SVMMatcher(name='SVM', random_state=0)
-# elif config["ml_matcher"] == "rf":
-#     matcher = em.RFMatcher(name='RF', random_state=0)
-# elif config["ml_matcher"] == "lg":
-#     matcher = em.LogRegMatcher(name='LogReg', random_state=0)
-# elif config["ml_matcher"] == "ln":
-#     matcher = em.LinRegMatcher(name='LinReg')
-# elif config["ml_matcher"] == "nb":
-#     matcher = em.NBMatcher(name='NaiveBayes')
-# feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
-#
-# train_feature_vecs = em.extract_feature_vecs(train_set,
-#                                              feature_table=feature_table,
-#                                              attrs_after=['gold'],
-#                                              show_progress=False)
-# train_feature_vecs.fillna(value=0, inplace=True)
-# test_feature_after = attrs_with_l_prefix[:]
-# test_feature_after.extend(attrs_with_r_prefix)
-# for _ in test_feature_after:
-#     if _.endswith(tables_id):
-#         test_feature_after.remove(_)
-# test_feature_after.append('gold')
-# test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-#                                             attrs_after=test_feature_after, show_progress=False)
-# test_feature_vecs.fillna(value=0, inplace=True)
-# fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
-# matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
-#
-# test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
-# predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
-#                               append=True, target_attr='predicted', inplace=False)
-# eval_result = em.eval_matches(predictions, 'gold', 'predicted')
-# em.print_eval_summary(eval_result)
-# indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, candidate_for_train_test)
-# print(indicators)
-#
-# # compute interpretability
-# predictions_attrs = []
-# predictions_attrs.extend(attrs_with_l_prefix)
-# predictions_attrs.extend(attrs_with_r_prefix)
-# predictions_attrs.extend(['gold', 'predicted'])
-# predictions = predictions[predictions_attrs]
-# process_prediction_for_md_discovery(predictions)
-# predictions = predictions.reset_index(drop=True)
-# predictions = predictions.astype(str)
-# sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
-#
-# # default output path is "../md_discovery/output/xxx.txt"
-# # two MD files in total: mds / vio
-# md_discover(config)
-# md_paths = [md_output_dir + 'mds.txt', md_output_dir + 'vio.txt']
-# md_list = load_mds(md_paths)  # read all MDs from the global variables
-# epl_match = 0  # number of explicable predicted matches
-# if len(md_list) > 0:
-#     for line in predictions.itertuples():
-#         if is_explicable(line, md_list, sim_tensor_dict) and str(getattr(line, 'predicted')) == str(1):
-#             epl_match += 1
-#
-# ppre = predictions[predictions['predicted'] == str(1)]
-# interpretability = epl_match / len(ppre)  # interpretability score
-#
-# if (indicators["block_recall"] < 0.8) and (indicators["block_recall"] < indicators["recall"]):
-#     f1 = (2.0 * indicators["precision"] * indicators["block_recall"]) / (
-#             indicators["precision"] + indicators["block_recall"])
-# else:
-#     f1 = indicators["F1"]
-# # if indicators["block_recall"] < 0.8:
-# #     return 1
-# # f1 = indicators["F1"]
-# performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
-# print('Interpretability: ', interpretability)
-# return 1 - performance
 def ml_er_hpo():
     classifier = Classifier()
@@ -234,7 +45,6 @@ def ml_er_hpo():
     with open(hpo_output_dir + "configspace.json", "w") as f:
         json.dump(dict_configspace, f, indent=4)
-    # Next, we create an object, holding general information about the run
     scenario = Scenario(
         cs,
         deterministic=True,
@@ -244,7 +54,6 @@ def ml_er_hpo():
     initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
-    # Now we use SMAC to find the best hyperparameters
     smac = HyperparameterOptimizationFacade(
         scenario,
         classifier.train,
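For readers unfamiliar with SMAC3, this is the usual end-to-end shape of the facade call shown in this hunk. It is a sketch, not this file's exact code: the n_trials budget, the run_hpo wrapper, and the classifier.configspace property are assumptions (SMAC's own examples expose the space that way), while Scenario, get_initial_design(n_configs=5), and classifier.train come from the hunk above.

from smac import HyperparameterOptimizationFacade, Scenario

def run_hpo(classifier, n_trials: int = 50):
    # classifier.configspace is assumed to return the ConfigurationSpace built above,
    # and classifier.train(config, seed) to return the value SMAC should minimize
    # (here 1 - performance, so lower is better).
    scenario = Scenario(classifier.configspace, deterministic=True, n_trials=n_trials)
    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    smac = HyperparameterOptimizationFacade(scenario, classifier.train, initial_design=initial_design)
    incumbent = smac.optimize()  # best configuration found within the budget
    return incumbent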

@@ -1,16 +1,9 @@
-import multiprocessing
-import time
-from concurrent.futures import ProcessPoolExecutor
-from multiprocessing.managers import SharedMemoryManager
 import numpy as np
 import pandas as pd
 import copy
 import torch
 from ConfigSpace import Configuration
 from tqdm import tqdm
 from settings import model
@@ -97,59 +90,58 @@ def pairs_inference(path, target_col, conf: Configuration):
                     md_list.remove(md)
                     violated_mds.append(md)
-            for vio_md in violated_mds:
-                # specialize the left-hand side
-                for col in cols_but_target:
-                    if sims[col] + 0.01 <= 1:
-                        spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
-                        if is_minimal(spec_l_md, md_list, target_col):
-                            md_list.append(spec_l_md)
-                if vio_md not in minimal_vio:
-                    minimal_vio.append(vio_md)
             # for vio_md in violated_mds:
             #     # specialize the left-hand side
             #     for col in cols_but_target:
-            #         if sims[col] < 1:
+            #         if sims[col] + 0.01 <= 1:
             #             spec_l_md = copy.deepcopy(vio_md)
-            #             if sims[col] < threshold:
-            #                 spec_l_md[col] = threshold
-            #             else:
-            #                 if sims[col] + 0.01 <= 1:
-            #                     spec_l_md[col] = sims[col] + 0.01
-            #                 else:
-            #                     spec_l_md[col] = 1
+            #             spec_l_md[col] = simt if sims[col] < simt else sims[col] + 0.01
             #             if is_minimal(spec_l_md, md_list, target_col):
             #                 md_list.append(spec_l_md)
             #     if vio_md not in minimal_vio:
             #         minimal_vio.append(vio_md)
+            for vio_md in violated_mds:
+                vio_md_support, vio_md_confidence = get_metrics(vio_md, data, sim_tensor, target_col, target_index)
+                if vio_md_support >= supt:
+                    for col in cols_but_target:
+                        if sims[col] < 1.0:
+                            spec_l_md = copy.deepcopy(vio_md)
+                            if sims[col] < simt:
+                                spec_l_md[col] = simt
+                            else:
+                                if sims[col] + 0.01 <= 1.0:
+                                    spec_l_md[col] = sims[col] + 0.01
+                                else:
+                                    spec_l_md[col] = 1.0
+                            if is_minimal(spec_l_md, md_list, target_col):
+                                md_list.append(spec_l_md)
+                if vio_md not in minimal_vio:
+                    minimal_vio.append(vio_md)
             if len(md_list) == 0:
                 terminate = True
                 break
         if terminate:
             break
-    tmp = []
-    for _ in md_list:
-        if _ not in tmp:
-            tmp.append(_)
-    md_list = tmp
-    if len(md_list) > 0:
-        for vio in minimal_vio[:]:
-            if not is_minimal(vio, md_list, target_col):
-                minimal_vio.remove(vio)
-        for _ in md_list[:]:
-            if not is_minimal(_, md_list, target_col):
-                md_list.remove(_)
-    print('mds_list\t', len(md_list), '\n')
-    print('vio_list\t', len(minimal_vio), '\n')
-    if len(minimal_vio) == 0:
-        return md_list, []
-    remove_list = []
+    if len(minimal_vio) > 0:
+        remove_list = []
+        for md in minimal_vio:
+            support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+            if confidence < cont:
+                remove_list.append(md)
+        for _ in remove_list:
+            minimal_vio.remove(_)
     if len(md_list) > 0:
+        # remove duplicate MDs
+        tmp = []
+        for _ in md_list:
+            if _ not in tmp:
+                tmp.append(_)
+        md_list = tmp
+        # remove MDs whose support is below the threshold
         md_rm_list = []
         for _ in md_list:
             support, confidence = get_metrics(_, data, sim_tensor, target_col, target_index)
@@ -157,23 +149,23 @@ def pairs_inference(path, target_col, conf: Configuration):
                 md_rm_list.append(_)
         for _ in md_rm_list:
             md_list.remove(_)
-    for md in minimal_vio:
-        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
-        # fuck.append((support, confidence))
-        if support < supt:
-            remove_list.append(md)
-        if confidence < cont and md not in remove_list:
-            remove_list.append(md)
-    # fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
-    for _ in remove_list:
-        minimal_vio.remove(_)
-    for _ in minimal_vio[:]:
-        if not is_minimal(_, minimal_vio, target_col):
-            minimal_vio.remove(_)
-    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
-    print(f"Support: {supt}\tConfidence: {cont}")
+        # remove MDs that are not minimal
+        for _ in md_list[:]:
+            if not is_minimal(_, md_list, target_col):
+                md_list.remove(_)
+        if len(minimal_vio) > 0:
+            for vio in minimal_vio[:]:
+                if not is_minimal(vio, md_list, target_col):
+                    minimal_vio.remove(vio)
+    if len(minimal_vio) > 0:
+        for _ in minimal_vio[:]:
+            if not is_minimal(_, minimal_vio, target_col):
+                minimal_vio.remove(_)
+    print(f'\033[33mList Length: {len(md_list)}\033[0m')
+    print(f'\033[33mVio Length: {len(minimal_vio)}\033[0m')
+    print(f'\033[33mSupport: {supt}\tConfidence: {cont}\033[0m')
     return md_list, minimal_vio
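To make the specialization step changed above easier to follow, here is a minimal, standalone sketch of the same idea: when a candidate MD is violated by a tuple pair, the similarity threshold of one left-hand-side column is raised to simt, or just above the observed similarity, so the specialized MD no longer covers the violating pair. The dict-based MD layout and the helper name are simplifications, not the project's exact types.

import copy

def specialize_lhs(vio_md: dict, sims: dict, target_col: str, simt: float) -> list:
    # vio_md maps column name -> LHS similarity threshold; sims holds the observed
    # similarities of the violating tuple pair. One specialization per eligible column.
    specialized = []
    for col in vio_md:
        if col == target_col or sims[col] >= 1.0:
            continue  # target column is never specialized; a threshold of 1.0 cannot be raised
        spec = copy.deepcopy(vio_md)
        # raise the threshold to simt, or just above the observed similarity, capped at 1.0
        spec[col] = simt if sims[col] < simt else min(sims[col] + 0.01, 1.0)
        specialized.append(spec)
    return specialized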
@@ -196,11 +188,11 @@ def get_metrics(current_md, data, sim_tensor, target_col, target_index):
         sup_tensor_slice = sup_tensor[i]
         ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
     sup_tensor_int = ini_slice.int()
-    support = torch.count_nonzero(sup_tensor_int).item()
+    support_Naumann = torch.count_nonzero(sup_tensor_int).item()
     ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
     conf_tensor_int = ini_slice.int()
-    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
-    confidence = confidence_numerator / support
-    return support, confidence
+    support_Fan = torch.count_nonzero(conf_tensor_int).item()
+    confidence = support_Fan / support_Naumann
+    return support_Fan, confidence
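The renaming makes the two counting conventions explicit: support in Naumann's sense counts tuple pairs that satisfy only the left-hand side, support in Fan's sense additionally requires the right-hand side, and confidence is their ratio; the function now returns the Fan-style count. A standalone sketch of the same computation on boolean masks (the tensor names and the tiny example are illustrative, not the function's actual locals):

import torch

def md_support_confidence(lhs_satisfied: torch.Tensor, rhs_satisfied: torch.Tensor):
    # lhs_satisfied / rhs_satisfied: boolean masks over all candidate tuple pairs.
    support_naumann = int(torch.count_nonzero(lhs_satisfied))               # LHS only
    support_fan = int(torch.count_nonzero(lhs_satisfied & rhs_satisfied))   # LHS and RHS
    confidence = support_fan / support_naumann if support_naumann else 0.0
    return support_fan, confidence

# Example: 4 candidate pairs, 3 satisfy the LHS, 2 of those also satisfy the RHS.
lhs = torch.tensor([True, True, True, False])
rhs = torch.tensor([True, False, True, True])
print(md_support_confidence(lhs, rhs))  # (2, 0.666...)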

@@ -1,6 +1,6 @@
 from ConfigSpace import Configuration
-from md_discovery import tmp_discover
+from md_discovery.discovery_executor import pairs_inference
 from settings import *
 # # if support and confidence are not to be written out, use the following two code blocks
@@ -21,7 +21,7 @@ def md_discover(config: Configuration):
     t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
     # input: the csv file path, the similarity threshold for the MD left-hand side, and the target column on the MD right-hand side
     # output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold
-    mds_list, vio_list = tmp_discover.pairs_inference(t_single_tuple_path, target_attr, config)
+    mds_list, vio_list = pairs_inference(t_single_tuple_path, target_attr, config)
     # write list 1 to a local path (adjust the path as needed)
     mds_path = md_output_dir + "mds.txt"

@@ -184,7 +184,7 @@ def er_process(config: Configuration):
         candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
                                          config["block_attr"], allow_missing=True,
                                          l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                         overlap_size=config["overlap_size"], show_progress=False)
+                                         overlap_size=1, show_progress=False)
     elif config["ml_blocker"] == "attr_equiv":
         blocker = em.AttrEquivalenceBlocker()
         candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"],
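For context, a minimal sketch of how the two blocker choices in this hunk are typically invoked in py_entitymatching, assuming both input tables already have their keys registered with em.set_key. The wrapper name and table variables are placeholders; the keyword arguments mirror the calls shown above.

import py_entitymatching as em

def run_blocking(ltable, rtable, block_attr, output_attrs, use_overlap=True):
    # Overlap blocking keeps pairs whose block_attr values share at least one token;
    # attribute-equivalence blocking keeps pairs whose block_attr values are exactly equal.
    if use_overlap:
        blocker = em.OverlapBlocker()
        return blocker.block_tables(ltable, rtable, block_attr, block_attr,
                                    l_output_attrs=output_attrs, r_output_attrs=output_attrs,
                                    overlap_size=1, allow_missing=True, show_progress=False)
    blocker = em.AttrEquivalenceBlocker()
    return blocker.block_tables(ltable, rtable, block_attr, block_attr,
                                l_output_attrs=output_attrs, r_output_attrs=output_attrs,
                                allow_missing=True)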
@@ -336,7 +336,6 @@ def ml_er(config: Configuration = None):
         f.write('performance:' + str(indicators['performance']) + '\n')
 if __name__ == '__main__':
     if os.path.isfile(hpo_output_dir + "incumbent.json"):
         with open(hpo_output_dir + "configspace.json", 'r') as f:

@@ -1,11 +1,10 @@
 from sentence_transformers import SentenceTransformer
-import numpy as np
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
-mapping_lid = 'idAbt'  # id column of the left table in the mapping table
-mapping_rid = 'idBuy'  # id column of the right table in the mapping table
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Fodors-Zagats\matches.csv'
+mapping_lid = 'ltable_id'  # id column of the left table in the mapping table
+mapping_rid = 'rtable_id'  # id column of the right table in the mapping table
 ltable_id = 'id'  # name of the left table's id column
 rtable_id = 'id'  # name of the right table's id column
 target_attr = 'id'  # target column for MD discovery

@@ -9,7 +9,7 @@ import pandas as pd
 import torch
 from tqdm import tqdm
 from ConfigSpace.read_and_write import json as csj
-from md_discovery import tmp_discover
+from md_discovery import discovery_executor
 from settings import er_output_dir, hpo_output_dir
