Record the corresponding MD in the prediction results

Print the runtime
main
HuangJintao 7 months ago
parent d79453e5c3
commit d1d4b0d430

.gitignore

@@ -1 +1,3 @@
-/md_discovery/output/mds.txt
+/md_discovery/output/*
+/ml_er/output/*
+/hpo/output/*

@@ -1,8 +1,10 @@
 import json
+import time
+from colorama import init, Fore
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
-from smac import Scenario, HyperparameterOptimizationFacade
+from smac import Scenario, BlackBoxFacade
 from ml_er.deepmatcher_er import matching
 from setting import hpo_output_dir
@@ -31,31 +33,32 @@ class Optimization:
         return cs

-    def train(self, config: Configuration, seed: int = 0) -> float:
+    def train(self, config: Configuration, seed: int = 0, ) -> float:
         indicators = matching(config)
         return 1 - indicators['performance']


 def ml_er_hpo():
+    # init(autoreset=True)
     optimization = Optimization()
     cs = optimization.configspace
     str_configspace = csj.write(cs)
     dict_configspace = json.loads(str_configspace)
     # save the hyperparameter space locally
-    with open(hpo_output_dir + "configspace.json", "w") as f:
+    with open(hpo_output_dir + r"\configspace.json", "w") as f:
         json.dump(dict_configspace, f, indent=4)

     scenario = Scenario(
         cs,
         crash_cost=1.0,
         deterministic=True,
-        n_trials=20,
+        n_trials=16,
         n_workers=1
     )

-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
-    smac = HyperparameterOptimizationFacade(
+    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
+    smac = BlackBoxFacade(
         scenario,
         optimization.train,
         initial_design=initial_design,
@@ -66,19 +69,21 @@ def ml_er_hpo():
     incumbent_cost = smac.validate(incumbent)
     default = cs.get_default_configuration()
     default_cost = smac.validate(default)
-    print(f"Default Cost: {default_cost}")
-    print(f"Incumbent Cost: {incumbent_cost}")
+    print(Fore.BLUE + f"Default Cost: {default_cost}")
+    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")
     if incumbent_cost > default_cost:
         incumbent = default
-        print(f"Updated Incumbent Cost: {default_cost}")
-    print(f"Optimized Configuration:{incumbent.values()}")
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
+        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')
+    print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")
+    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
         json.dump(dict(incumbent), f, indent=4)
     return incumbent


 if __name__ == '__main__':
+    init(autoreset=True)
+    print(Fore.CYAN + f'Start Time: {time.time()}')
     ml_er_hpo()
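The "print the runtime" part of this commit logs raw time.time() epoch values at start and finish. A minimal sketch of a more readable variant (not part of the commit; it only assumes time, colorama, and the ml_er_hpo function from the script above):

import time
from colorama import init, Fore

if __name__ == '__main__':
    init(autoreset=True)
    start = time.time()
    # human-readable wall-clock start time instead of a raw epoch value
    print(Fore.CYAN + f'Start Time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start))}')
    ml_er_hpo()
    # elapsed seconds for the whole HPO run
    print(Fore.CYAN + f'Elapsed: {time.time() - start:.1f} s')

Separately, swapping HyperparameterOptimizationFacade for BlackBoxFacade also swaps SMAC's surrogate model from a random forest to a Gaussian process, a common choice for small, expensive budgets such as n_trials=16.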

@@ -18,6 +18,7 @@ from setting import *
 def mining(train: pd.DataFrame):
     # data is train set, in which each row represents a tuple pair
     train = train.astype(str)
+    # move the label column to the end
     train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)
     # instead of manually making the left/right table keys identical, only check whether the gold attribute is 1
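The pd.concat/pop line above moves the label column to the end of the training frame. A shorter equivalent (an illustrative alternative, not the project's code; the column names below are made up) relies on the fact that reassigning a popped column appends it as the last column:

import pandas as pd

# toy frame; column names are only illustrative
train = pd.DataFrame({'label': [0, 1], 'ltable_name': ['a', 'b'], 'rtable_name': ['a', 'c']})
train['label'] = train.pop('label')   # pop removes 'label', reassignment appends it last
print(list(train.columns))            # ['ltable_name', 'rtable_name', 'label']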
@@ -217,5 +218,5 @@ def merge_mds(md_list_):
 if __name__ == '__main__':
     _train = pd.read_csv(directory_path + r'\train_whole.csv')
     result = mining(_train)
-    with open(md_output_dir + "mds.pickle", "wb") as file_:
+    with open(md_output_dir + r"\mds.pickle", "wb") as file_:
         pickle.dump(result, file_)

@@ -10,16 +10,17 @@ import ConfigSpace
 from ConfigSpace import Configuration
 from ConfigSpace.read_and_write import json as csj
 import torch.nn.functional
+from colorama import init, Fore
 from tqdm import tqdm
 from setting import *


 def matching(config):
-    print(f'\033[33mConfig: {config}\033[0m')
-    start = time.time()
-    with open(md_output_dir + "mds.pickle", "rb") as file:
+    # init(autoreset=True)
+    print(Fore.BLUE + f'Config: {config}')
+    with open(md_output_dir + r"\mds.pickle", "rb") as file:
         md_list = pickle.load(file)

     train, valid, test = dm.data.process(
@@ -30,9 +31,9 @@ def matching(config):
         use_magellan_convention=True,  # same naming convention as Magellan
         embeddings=config['embeddings'])
-    train_table = train.get_raw_table()
-    test_table = test.get_raw_table()
-    valid_table = valid.get_raw_table()
+    # train_table = train.get_raw_table()
+    # test_table = test.get_raw_table()
+    # valid_table = valid.get_raw_table()

     attr_summarizer = config['attr_summarizer']
     if attr_summarizer == 'sif':
@@ -79,21 +80,25 @@ def matching(config):
     indicators = {}
     f1_score = model_.run_eval(test, device='cuda')
-    indicators["F1"] = f1_score
+    indicators["F1"] = f1_score.item() / 100
     predictions = model_.run_prediction(test, device='cuda', output_attributes=True)
     # predictions has no 'predicted' column; add it manually from match_score (deepmatcher uses a 0.5 threshold when computing F1)
     predictions['predicted'] = predictions['match_score'].apply(lambda score: 1 if score >= 0.5 else 0)
+    predictions = predictions.reset_index(drop=True)
+    predictions = predictions.astype(str)
     # predictions now contains all left/right table attributes plus label, predicted, match_score and _id
     sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)

     predictions['confidence'] = 0
+    predictions['md'] = ''
     epl_match = 0  # count of explainable predicted matches

     if len(md_list) > 0:
         for row in tqdm(predictions.itertuples()):
-            x = is_explicable(row, md_list, sim_tensor_dict)
-            if x > 0 and str(getattr(row, 'predicted')) == str(1):
-                predictions.loc[row[0], 'confidence'] = x
+            conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
+            if conf > 0 and str(getattr(row, 'predicted')) == str(1):
+                predictions.loc[row[0], 'confidence'] = conf
+                predictions.loc[row[0], 'md'] = str(md_dict)
                 epl_match += 1

     df = predictions[predictions['predicted'] == str(1)]
@@ -101,10 +106,10 @@ def matching(config):
     indicators['interpretability'] = interpretability
     performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
     indicators['performance'] = performance
-    print(f'ER Indicators: {indicators}')
+    print(Fore.BLUE + f'ER Indicators: {indicators}')
     predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
-    print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m')
+    print(Fore.CYAN + f'Finish Time: {time.time()}')
     return indicators
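With the new 'md' column, each explainable predicted match now carries the matching dependency that justified it, serialized via str(md_dict), next to its 'confidence'. A minimal sketch of reading those columns back from predictions.csv (illustrative, not part of the commit; it assumes the stored dict contains only string and numeric literals so ast.literal_eval can parse it):

import ast
import pandas as pd
from setting import er_output_dir

preds = pd.read_csv(er_output_dir + r'\predictions.csv')
# keep predicted matches that were explained by at least one MD
explained = preds[(preds['predicted'].astype(str) == '1') & preds['md'].notna() & (preds['md'] != '')]
for _, row in explained.iterrows():
    md = ast.literal_eval(row['md'])   # back from str(md_dict) to a dict
    print(row['confidence'], md)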
@@ -148,13 +153,13 @@ def is_explicable(row, all_mds: list, st_dict):
                 explicable = False  # if any column's similarity misses its threshold, this md cannot explain the current tuple pair
                 break  # stop checking the remaining thresholds of this md and move to the next md
         if explicable:
-            return md_tuple[2]  # return as soon as any md explains the pair
-    return -1.0  # traversal finished; the pair cannot be explained
+            return md_tuple[2], md_tuple[0]  # return as soon as any md explains the pair
+    return -1.0, {}  # traversal finished; the pair cannot be explained


 def ml_er(config: Configuration):
     indicators = matching(config)
-    output_path = er_output_dir + "eval_result.txt"
+    output_path = er_output_dir + r"\eval_result.txt"
     with open(output_path, 'w') as _f:
         _f.write('F1:' + str(indicators["F1"]) + '\n')
         _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
@@ -162,12 +167,12 @@ def ml_er(config: Configuration):
 if __name__ == '__main__':
-    if os.path.isfile(hpo_output_dir + "incumbent.json"):
-        with open(hpo_output_dir + "configspace.json", 'r') as f:
+    if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
+        with open(hpo_output_dir + r"\configspace.json", 'r') as f:
             dict_configspace = json.load(f)
         str_configspace = json.dumps(dict_configspace)
         configspace = csj.read(str_configspace)
-        with open(hpo_output_dir + "incumbent.json", 'r') as f:
+        with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
             dic = json.load(f)
         configuration = ConfigSpace.Configuration(configspace, values=dic)
         ml_er(configuration)

@@ -1,6 +1,6 @@
 from sentence_transformers import SentenceTransformer
-directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Textual\Abt-Buy'
+directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Structured\iTunes-Amazon'
 er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\ml_er\output'
 md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\md_discovery\output'
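Several hunks above change path concatenation from dir + "file" to dir + r"\file", which hard-codes the Windows separator. A platform-neutral alternative (a suggestion only, not what the commit does) is os.path.join on the directories that setting.py already exports:

import os
from setting import hpo_output_dir, md_output_dir, er_output_dir

configspace_path = os.path.join(hpo_output_dir, 'configspace.json')
incumbent_path = os.path.join(hpo_output_dir, 'incumbent.json')
mds_path = os.path.join(md_output_dir, 'mds.pickle')
predictions_path = os.path.join(er_output_dir, 'predictions.csv')
# os.path.join picks the correct separator for the running OS
print(configspace_path, mds_path, predictions_path)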
