parent
23ff0e6643
commit
894b69e9a7
@@ -0,0 +1,93 @@
import json
import sys
import time

from colorama import init, Fore
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
from smac import Scenario, BlackBoxFacade

# make the project root importable before loading project modules
sys.path.append('/root/hjt/md_bayesian_er_ditto/')

from ml_er.ditto_er import matching
from setting import hpo_output_dir


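# Bayesian-optimization wrapper around Ditto's matcher: `configspace` declares
# the tunable hyperparameters and `train` is the target function SMAC minimizes.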
class Optimization:
    @property
    def configspace(self) -> ConfigurationSpace:
        cs = ConfigurationSpace(seed=0)

        # task
        # run_id
        batch_size = Categorical('batch_size', [32, 64], default=64)
        max_len = Categorical('max_len', [64, 128, 256], default=256)
        # lr 3e-5
        # n_epochs 20
        # fine_tuning
        # save_model
        # logdir
        lm = Categorical('language_model', ['distilbert', 'roberta', 'bert-base-uncased', 'xlnet-base-cased'], default='distilbert')
        fp16 = Categorical('half_precision_float', [True, False])
        da = Categorical('data_augmentation', ['del', 'swap', 'drop_col', 'append_col', 'all'])
        # alpha_aug
        # dk
        summarize = Categorical('summarize', [True, False])
        # size

        cs.add_hyperparameters([batch_size, max_len, lm, fp16, da, summarize])
        return cs

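    # SMAC minimizes cost, so the target function returns 1 - performance; the
    # performance indicator reported by `matching` is assumed to lie in [0, 1]
    # (consistent with crash_cost=1.0 below).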
    # todo: train function
    def train(self, config: Configuration, seed: int = 0) -> float:
        indicators = matching(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    optimization = Optimization()
    cs = optimization.configspace
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)

    # save the hyperparameter search space locally
    with open(hpo_output_dir + "/configspace.json", "w") as f:
        json.dump(dict_configspace, f, indent=4)

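    # the Scenario bundles the search space with run settings: 16 trials on a
    # single worker, a deterministic target function, and cost 1.0 for crashed runs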
    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=16,
        n_workers=1
    )

    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)

    smac = BlackBoxFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
        overwrite=True,  # if the run exists, we overwrite it; alternatively, we can continue from the last state
    )

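    # run the optimization loop, then re-evaluate both the incumbent and the
    # default configuration so their costs are directly comparable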
    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(Fore.BLUE + f"Default Cost: {default_cost}")
    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")

    # fall back to the default configuration if the optimizer did not beat it
    if incumbent_cost > default_cost:
        incumbent = default
        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')

    print(Fore.BLUE + f"Optimized Configuration: {incumbent.values()}")

    with open(hpo_output_dir + "/incumbent.json", "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    init(autoreset=True)
    print(Fore.CYAN + f'Start Time: {time.time()}')
    ml_er_hpo()
@@ -0,0 +1,141 @@
import pickle
import torch
import json
import numpy as np
import random
import os
import sys

# make the project root importable and point Hugging Face at the mirror
# before the project / transformers imports below
sys.path.append('/root/hjt/md_bayesian_er_ditto/')
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# from ditto.matcher import *
from setting import *
from colorama import Fore
from argparse import Namespace
import ConfigSpace
from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj
from ditto.matcher import set_seed, to_str, classify, predict, tune_threshold, load_model
from ditto.ditto_light.dataset import DittoDataset
from ditto.ditto_light.summarize import Summarizer
from ditto.ditto_light.knowledge import *
from ditto.ditto_light.ditto import train


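# matching() follows Ditto's train_ditto.py entry point: it maps the sampled
# SMAC configuration onto Ditto's hyperparameter namespace, trains and applies
# a matcher, and is meant to return the indicators dict used by the callers
# (still a todo below).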
def matching(config):
    print(Fore.BLUE + f'Config: {config}')

    # with open(md_output_dir + "/mds.pickle", "rb") as file:
    #     md_list = pickle.load(file)

    # fixed settings plus the hyperparameters sampled by SMAC
    hp = Namespace()
    hp.task = directory_path.replace('/root/hjt/DeepMatcher Dataset/', '')
    hp.run_id = 0
    hp.batch_size = config['batch_size']
    hp.max_len = config['max_len']
    hp.lr = 3e-5
    hp.n_epochs = 20
    # hp.finetuning
    hp.save_model = True
    hp.input_path = '/root/autodl-tmp/input/candidates_small.jsonl'
    hp.output_path = '/root/autodl-tmp/output/matched_small.jsonl'
    hp.logdir = '/root/autodl-tmp/checkpoints/'
    hp.checkpoint_path = '/root/autodl-tmp/checkpoints/'
    hp.lm = config['language_model']
    hp.fp16 = config['half_precision_float']
    hp.da = config['data_augmentation']
    hp.alpha_aug = 0.8
    hp.dk = None
    hp.summarize = config['summarize']
    hp.size = None
    hp.use_gpu = True

    seed = hp.run_id
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # only a single task for the baseline
    task = hp.task

    # create the tag of the run
    run_tag = '%s_lm=%s_da=%s_dk=%s_su=%s_size=%s_id=%d' % (task, hp.lm, hp.da,
                                                            hp.dk, hp.summarize, str(hp.size), hp.run_id)
    run_tag = run_tag.replace('/', '_')

    # load the task configuration (note: this rebinds `config` from the SMAC
    # Configuration to the Ditto task entry from configs.json)
    configs = json.load(open('configs.json'))
    configs = {conf['name']: conf for conf in configs}
    config = configs[task]

    trainset = config['trainset']
    validset = config['validset']
    testset = config['testset']

    # summarize the sequences up to the max sequence length
    if hp.summarize:
        summarizer = Summarizer(config, lm=hp.lm)
        trainset = summarizer.transform_file(trainset, max_len=hp.max_len)
        validset = summarizer.transform_file(validset, max_len=hp.max_len)
        testset = summarizer.transform_file(testset, max_len=hp.max_len)

    # load train/dev/test sets
    train_dataset = DittoDataset(trainset,
                                 lm=hp.lm,
                                 max_len=hp.max_len,
                                 size=hp.size,
                                 da=hp.da)
    valid_dataset = DittoDataset(validset, lm=hp.lm)
    test_dataset = DittoDataset(testset, lm=hp.lm)

    # train and evaluate the model
    train(train_dataset,
          valid_dataset,
          test_dataset,
          run_tag, hp)

    set_seed(123)
    config, model = load_model(hp.task, hp.checkpoint_path,
                               hp.lm, hp.use_gpu, hp.fp16)

    summarizer = dk_injector = None
    if hp.summarize:
        summarizer = Summarizer(config, hp.lm)

    # tune threshold
    threshold = tune_threshold(config, model, hp)

    # run prediction
    predict(hp.input_path, hp.output_path, config, model,
            summarizer=summarizer,
            max_len=hp.max_len,
            lm=hp.lm,
            dk_injector=dk_injector,
            threshold=threshold)

    # todo: compute and return the indicators dict (F1, interpretability,
    #       performance) expected by the HPO and ml_er callers
    # write results
    # interpretability


# todo: ml_er function
def ml_er(config: Configuration):
    indicators = matching(config)
    output_path = er_output_dir + "/eval_result.txt"
    with open(output_path, 'w') as _f:
        _f.write('F1:' + str(indicators["F1"]) + '\n')
        _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
        _f.write('performance:' + str(indicators['performance']) + '\n')


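# entry point: rebuild the configuration space and the tuned incumbent saved by
# the HPO run, then evaluate that configuration once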
if __name__ == '__main__':
    if os.path.isfile(hpo_output_dir + "/incumbent.json"):
        with open(hpo_output_dir + "/configspace.json", 'r') as f:
            dict_configspace = json.load(f)
        str_configspace = json.dumps(dict_configspace)
        configspace = csj.read(str_configspace)
        with open(hpo_output_dir + "/incumbent.json", 'r') as f:
            dic = json.load(f)
        configuration = ConfigSpace.Configuration(configspace, values=dic)
        ml_er(configuration)
@@ -0,0 +1,12 @@
from sentence_transformers import SentenceTransformer

directory_path = '/root/hjt/DeepMatcher Dataset/Structured/Amazon-Google'

er_output_dir = '/root/hjt/md_bayesian_er_ditto/ml_er/output'
md_output_dir = '/root/hjt/md_bayesian_er_ditto/md_discovery/output'
hpo_output_dir = '/root/hjt/md_bayesian_er_ditto/hpo/output'

# model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
interpre_weight = 0  # interpretability weight
support_threshold = 1
confidence_threshold = 0.75
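# note: support_threshold and confidence_threshold are presumably consumed by the
# matching-dependency (MD) discovery step that writes to md_output_dir; they are
# not referenced elsewhere in this commit (assumption)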