From d1d4b0d430e5ff6d9fa35563d59878f12e2b2d3b Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Tue, 7 May 2024 22:18:59 +0800
Subject: [PATCH] Record the corresponding MD in prediction results; print run times
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                |  4 +++-
 hpo/deepmatcher_hpo.py    | 27 +++++++++++++++-----------
 md_discovery/md_mining.py |  3 ++-
 ml_er/deepmatcher_er.py   | 41 ++++++++++++++++++++++-----------------
 setting.py                |  2 +-
 5 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index af5d226..45e379d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
-/md_discovery/output/mds.txt
+/md_discovery/output/*
+/ml_er/output/*
+/hpo/output/*
diff --git a/hpo/deepmatcher_hpo.py b/hpo/deepmatcher_hpo.py
index c7db9c2..462b925 100644
--- a/hpo/deepmatcher_hpo.py
+++ b/hpo/deepmatcher_hpo.py
@@ -1,8 +1,10 @@
 import json
+import time
+from colorama import init, Fore
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
-from smac import Scenario, HyperparameterOptimizationFacade
+from smac import Scenario, BlackBoxFacade
 from ml_er.deepmatcher_er import matching
 from setting import hpo_output_dir
 
@@ -31,31 +33,32 @@ class Optimization:
 
         return cs
 
-    def train(self, config: Configuration, seed: int = 0) -> float:
+    def train(self, config: Configuration, seed: int = 0, ) -> float:
         indicators = matching(config)
         return 1 - indicators['performance']
 
 
 def ml_er_hpo():
+    # init(autoreset=True)
     optimization = Optimization()
     cs = optimization.configspace
     str_configspace = csj.write(cs)
     dict_configspace = json.loads(str_configspace)
     # Save the hyperparameter space locally
-    with open(hpo_output_dir + "configspace.json", "w") as f:
+    with open(hpo_output_dir + r"\configspace.json", "w") as f:
         json.dump(dict_configspace, f, indent=4)
 
     scenario = Scenario(
         cs,
         crash_cost=1.0,
         deterministic=True,
-        n_trials=20,
+        n_trials=16,
         n_workers=1
     )
 
-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
 
-    smac = HyperparameterOptimizationFacade(
+    smac = BlackBoxFacade(
         scenario,
         optimization.train,
         initial_design=initial_design,
@@ -66,19 +69,21 @@ def ml_er_hpo():
     incumbent_cost = smac.validate(incumbent)
     default = cs.get_default_configuration()
     default_cost = smac.validate(default)
-    print(f"Default Cost: {default_cost}")
-    print(f"Incumbent Cost: {incumbent_cost}")
+    print(Fore.BLUE + f"Default Cost: {default_cost}")
+    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")
 
     if incumbent_cost > default_cost:
         incumbent = default
-        print(f"Updated Incumbent Cost: {default_cost}")
+        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')
 
-    print(f"Optimized Configuration:{incumbent.values()}")
+    print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")
 
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
+    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
         json.dump(dict(incumbent), f, indent=4)
     return incumbent
 
 
 if __name__ == '__main__':
+    init(autoreset=True)
+    print(Fore.CYAN + f'Start Time: {time.time()}')
     ml_er_hpo()
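Switching from HyperparameterOptimizationFacade to BlackBoxFacade above changes SMAC's default surrogate model from a random forest to a Gaussian process. A minimal, self-contained sketch of the same facade wiring, with a hypothetical toy target function standing in for Optimization.train (the hyperparameter 'x' and its bounds are illustrative only, not part of this project):

from ConfigSpace import Configuration, ConfigurationSpace, Float
from smac import BlackBoxFacade, Scenario


def toy_train(config: Configuration, seed: int = 0) -> float:
    # SMAC minimizes the returned cost, mirroring the 1 - indicators['performance'] objective above.
    return (config['x'] - 0.3) ** 2


cs = ConfigurationSpace()
cs.add_hyperparameters([Float('x', (0.0, 1.0), default=0.5)])
scenario = Scenario(cs, crash_cost=1.0, deterministic=True, n_trials=16, n_workers=1)
initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)
smac = BlackBoxFacade(scenario, toy_train, initial_design=initial_design)
incumbent = smac.optimize()
print(incumbent)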
diff --git a/md_discovery/md_mining.py b/md_discovery/md_mining.py
index 56010f1..e077c1e 100644
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@@ -18,6 +18,7 @@ from setting import *
 def mining(train: pd.DataFrame):
     # data is train set, in which each row represents a tuple pair
     train = train.astype(str)
+    # move the label column to the end
     train = pd.concat([train, pd.DataFrame({'label': train.pop('label')})], axis=1)
 
     # instead of manually aligning the left/right table keys, only check whether the gold attribute is 1
@@ -217,5 +218,5 @@ def merge_mds(md_list_):
 if __name__ == '__main__':
     _train = pd.read_csv(directory_path + r'\train_whole.csv')
     result = mining(_train)
-    with open(md_output_dir + "mds.pickle", "wb") as file_:
+    with open(md_output_dir + r"\mds.pickle", "wb") as file_:
         pickle.dump(result, file_)
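The comment added in mining() documents a small pandas idiom: DataFrame.pop() removes the label column and returns it, and pd.concat() re-appends it as the last column. A tiny illustration with made-up column names:

import pandas as pd

df = pd.DataFrame({'label': [1, 0], 'ltable_name': ['iPod', 'iPad'], 'rtable_name': ['iPod', 'iPhone']})
# pop() drops 'label' from df and returns it as a Series; concat() puts it back at the end.
df = pd.concat([df, pd.DataFrame({'label': df.pop('label')})], axis=1)
print(df.columns.tolist())  # ['ltable_name', 'rtable_name', 'label']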
diff --git a/ml_er/deepmatcher_er.py b/ml_er/deepmatcher_er.py
index 45a31b3..7c7a157 100644
--- a/ml_er/deepmatcher_er.py
+++ b/ml_er/deepmatcher_er.py
@@ -10,16 +10,17 @@ import ConfigSpace
 from ConfigSpace import Configuration
 from ConfigSpace.read_and_write import json as csj
 import torch.nn.functional
+from colorama import init, Fore
 from tqdm import tqdm
 
 from setting import *
 
 
 def matching(config):
-    print(f'\033[33mConfig: {config}\033[0m')
-    start = time.time()
+    # init(autoreset=True)
+    print(Fore.BLUE + f'Config: {config}')
 
-    with open(md_output_dir + "mds.pickle", "rb") as file:
+    with open(md_output_dir + r"\mds.pickle", "rb") as file:
         md_list = pickle.load(file)
 
     train, valid, test = dm.data.process(
@@ -30,9 +31,9 @@ def matching(config):
         use_magellan_convention=True,  # same naming convention as Magellan
         embeddings=config['embeddings'])
 
-    train_table = train.get_raw_table()
-    test_table = test.get_raw_table()
-    valid_table = valid.get_raw_table()
+    # train_table = train.get_raw_table()
+    # test_table = test.get_raw_table()
+    # valid_table = valid.get_raw_table()
 
     attr_summarizer = config['attr_summarizer']
     if attr_summarizer == 'sif':
@@ -79,21 +80,25 @@ def matching(config):
 
     indicators = {}
     f1_score = model_.run_eval(test, device='cuda')
-    indicators["F1"] = f1_score
+    indicators["F1"] = f1_score.item() / 100
 
     predictions = model_.run_prediction(test, device='cuda', output_attributes=True)
     # predictions has no 'predicted' column; add it from match_score (deepmatcher uses a 0.5 threshold when computing F1)
     predictions['predicted'] = predictions['match_score'].apply(lambda score: 1 if score >= 0.5 else 0)
+    predictions = predictions.reset_index(drop=True)
+    predictions = predictions.astype(str)
     # predictions now contains: all left/right table attributes + label + predicted + match_score + _id
     sim_tensor_dict = build_col_pairs_sim_tensor_dict(predictions)
 
     predictions['confidence'] = 0
+    predictions['md'] = ''
     epl_match = 0  # explicable and predicted as a match
     if len(md_list) > 0:
         for row in tqdm(predictions.itertuples()):
-            x = is_explicable(row, md_list, sim_tensor_dict)
-            if x > 0 and str(getattr(row, 'predicted')) == str(1):
-                predictions.loc[row[0], 'confidence'] = x
+            conf, md_dict = is_explicable(row, md_list, sim_tensor_dict)
+            if conf > 0 and str(getattr(row, 'predicted')) == str(1):
+                predictions.loc[row[0], 'confidence'] = conf
+                predictions.loc[row[0], 'md'] = str(md_dict)
                 epl_match += 1
 
     df = predictions[predictions['predicted'] == str(1)]
@@ -101,10 +106,10 @@ def matching(config):
     indicators['interpretability'] = interpretability
     performance = interpre_weight * interpretability + (1 - interpre_weight) * indicators["F1"]
     indicators['performance'] = performance
-    print(f'ER Indicators: {indicators}')
+    print(Fore.BLUE + f'ER Indicators: {indicators}')
     predictions.to_csv(er_output_dir + r'\predictions.csv', sep=',', index=False, header=True)
-    print(f'\033[33mTime consumed by matching in seconds: {time.time() - start}\033[0m')
+    print(Fore.CYAN + f'Finish Time: {time.time()}')
     return indicators
 
 
@@ -148,13 +153,13 @@ def is_explicable(row, all_mds: list, st_dict):
                 explicable = False  # if any attribute similarity misses its threshold, this MD cannot explain the current tuple pair
                 break  # skip the remaining thresholds of this MD and move on to the next one
         if explicable:
-            return md_tuple[2]  # return as soon as any MD explains the pair
-    return -1.0  # traversal finished, not explicable
+            return md_tuple[2], md_tuple[0]  # return as soon as any MD explains the pair
+    return -1.0, {}  # traversal finished, not explicable
 
 
 def ml_er(config: Configuration):
     indicators = matching(config)
-    output_path = er_output_dir + "eval_result.txt"
+    output_path = er_output_dir + r"\eval_result.txt"
     with open(output_path, 'w') as _f:
         _f.write('F1:' + str(indicators["F1"]) + '\n')
         _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
@@ -162,12 +167,12 @@
 if __name__ == '__main__':
-    if os.path.isfile(hpo_output_dir + "incumbent.json"):
-        with open(hpo_output_dir + "configspace.json", 'r') as f:
+    if os.path.isfile(hpo_output_dir + r"\incumbent.json"):
+        with open(hpo_output_dir + r"\configspace.json", 'r') as f:
             dict_configspace = json.load(f)
             str_configspace = json.dumps(dict_configspace)
             configspace = csj.read(str_configspace)
-        with open(hpo_output_dir + "incumbent.json", 'r') as f:
+        with open(hpo_output_dir + r"\incumbent.json", 'r') as f:
             dic = json.load(f)
             configuration = ConfigSpace.Configuration(configspace, values=dic)
             ml_er(configuration)
diff --git a/setting.py b/setting.py
index 0f7b67c..a76028e 100644
--- a/setting.py
+++ b/setting.py
@@ -1,6 +1,6 @@
 from sentence_transformers import SentenceTransformer
 
-directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Textual\Abt-Buy'
+directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Structured\iTunes-Amazon'
 er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\ml_er\output'
 md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\md_discovery\output'
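With the changes above, matching() records each explicable match's MD as str(md_dict) in the new 'md' column and its confidence value in the 'confidence' column of predictions.csv. A hedged sketch of reading those columns back, assuming the MD is serialized as a Python dict literal and reusing er_output_dir from setting.py:

import ast

import pandas as pd

from setting import er_output_dir

predictions = pd.read_csv(er_output_dir + r'\predictions.csv')
# Rows with confidence > 0 are predicted matches that some MD could explain.
explained = predictions[predictions['confidence'] > 0]
for _, row in explained.iterrows():
    md_dict = ast.literal_eval(row['md'])  # assumed to be a dict of per-attribute similarity thresholds
    print(row['confidence'], md_dict)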