parent 23ff0e6643
commit 894b69e9a7
@ -0,0 +1,93 @@
import json
import os
import sys
import time

# the project root must be on sys.path before the project-local imports below
sys.path.append('/root/hjt/md_bayesian_er_ditto/')

from colorama import init, Fore
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
from smac import Scenario, BlackBoxFacade

from ml_er.ditto_er import matching
from setting import hpo_output_dir


class Optimization:
    @property
    def configspace(self) -> ConfigurationSpace:
        cs = ConfigurationSpace(seed=0)

        # fixed Ditto arguments that are not tuned here: task, run_id,
        # lr (3e-5), n_epochs (20), fine_tuning, save_model, logdir,
        # alpha_aug, dk, size
        batch_size = Categorical('batch_size', [32, 64], default=64)
        max_len = Categorical('max_len', [64, 128, 256], default=256)
        lm = Categorical('language_model',
                         ['distilbert', 'roberta', 'bert-base-uncased', 'xlnet-base-cased'],
                         default='distilbert')
        fp16 = Categorical('half_precision_float', [True, False])
        da = Categorical('data_augmentation', ['del', 'swap', 'drop_col', 'append_col', 'all'])
        summarize = Categorical('summarize', [True, False])

        cs.add_hyperparameters([batch_size, max_len, lm, fp16, da, summarize])
        return cs

    # TODO: train function
    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes the returned cost, so optimize 1 - performance
        indicators = matching(config)
        return 1 - indicators['performance']

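
# A quick smoke test of the search space (a minimal sketch using ConfigSpace's
# public API; illustrative only, not part of the HPO run):
#
#   cs = Optimization().configspace
#   cfg = cs.sample_configuration()  # one random Configuration
#   print(dict(cfg))                 # e.g. {'batch_size': 32, 'language_model': 'roberta', ...}
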
def ml_er_hpo():
    optimization = Optimization()
    cs = optimization.configspace
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # save the hyperparameter space locally (os.path.join keeps the path
    # portable; the original used a Windows-style backslash literal)
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=16,
        n_workers=1
    )
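
    # Note: in SMAC3 the `n_trials` budget covers all evaluations, so the five
    # initial-design configurations below count toward the 16 trials.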

    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)

    smac = BlackBoxFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
        overwrite=True,  # if the run exists, overwrite it; alternatively, continue from the last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(Fore.BLUE + f"Default Cost: {default_cost}")
    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")

    # fall back to the default configuration if the incumbent is worse
    if incumbent_cost > default_cost:
        incumbent = default
        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')

    print(Fore.BLUE + f"Optimized Configuration: {dict(incumbent)}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    init(autoreset=True)
    print(Fore.CYAN + f'Start Time: {time.time()}')
    ml_er_hpo()
@ -0,0 +1,141 @@

import os
import sys

# set the HF mirror and project root before the transformers/ditto imports run
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
sys.path.append('/root/hjt/md_bayesian_er_ditto/')

import pickle
import torch
import json
import numpy as np
import random
# from ditto.matcher import *
from setting import *
from colorama import Fore
from argparse import Namespace
import ConfigSpace
from ConfigSpace import Configuration
from ConfigSpace.read_and_write import json as csj
from ditto.matcher import set_seed, to_str, classify, predict, tune_threshold, load_model
from ditto.ditto_light.dataset import DittoDataset
from ditto.ditto_light.summarize import Summarizer
from ditto.ditto_light.knowledge import *
from ditto.ditto_light.ditto import train


def matching(config):
    print(Fore.BLUE + f'Config: {config}')

    # with open(os.path.join(md_output_dir, "mds.pickle"), "rb") as file:
    #     md_list = pickle.load(file)

    # fixed Ditto arguments plus the hyperparameters sampled by SMAC
    hp = Namespace()
    hp.task = directory_path.replace('/root/hjt/DeepMatcher Dataset/', '')
    hp.run_id = 0
    hp.batch_size = config['batch_size']
    hp.max_len = config['max_len']
    hp.lr = 3e-5
    hp.n_epochs = 20
    # hp.finetuning
    hp.save_model = True
    hp.input_path = '/root/autodl-tmp/input/candidates_small.jsonl'
    hp.output_path = '/root/autodl-tmp/output/matched_small.jsonl'
    hp.logdir = '/root/autodl-tmp/checkpoints/'
    hp.checkpoint_path = '/root/autodl-tmp/checkpoints/'
    hp.lm = config['language_model']
    hp.fp16 = config['half_precision_float']
    hp.da = config['data_augmentation']
    hp.alpha_aug = 0.8
    hp.dk = None
    hp.summarize = config['summarize']
    hp.size = None
    hp.use_gpu = True

    # seed everything for reproducibility
    seed = hp.run_id
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # only a single task for the baseline
    task = hp.task

    # create the tag of the run
    run_tag = '%s_lm=%s_da=%s_dk=%s_su=%s_size=%s_id=%d' % (task, hp.lm, hp.da, hp.dk,
                                                            hp.summarize, str(hp.size), hp.run_id)
    run_tag = run_tag.replace('/', '_')
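    # e.g. 'Structured_Amazon-Google_lm=distilbert_da=del_dk=None_su=True_size=None_id=0'
    # (illustrative values; the slash in the task name is replaced above)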

    # load the task configuration; note that this rebinds `config` from the
    # hyperparameter Configuration to the task's dataset configuration
    configs = json.load(open('configs.json'))
    configs = {conf['name']: conf for conf in configs}
    config = configs[task]

    trainset = config['trainset']
    validset = config['validset']
    testset = config['testset']

    # summarize the sequences up to the max sequence length
    if hp.summarize:
        summarizer = Summarizer(config, lm=hp.lm)
        trainset = summarizer.transform_file(trainset, max_len=hp.max_len)
        validset = summarizer.transform_file(validset, max_len=hp.max_len)
        testset = summarizer.transform_file(testset, max_len=hp.max_len)

    # load train/dev/test sets
    train_dataset = DittoDataset(trainset,
                                 lm=hp.lm,
                                 max_len=hp.max_len,
                                 size=hp.size,
                                 da=hp.da)
    valid_dataset = DittoDataset(validset, lm=hp.lm)
    test_dataset = DittoDataset(testset, lm=hp.lm)

    # train and evaluate the model
    train(train_dataset,
          valid_dataset,
          test_dataset,
          run_tag, hp)

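    # Assumption: with hp.save_model=True, Ditto's train() checkpoints the model
    # under hp.logdir, which load_model() below reads back via hp.checkpoint_path.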
    set_seed(123)
    config, model = load_model(hp.task, hp.checkpoint_path,
                               hp.lm, hp.use_gpu, hp.fp16)

    summarizer = dk_injector = None
    if hp.summarize:
        summarizer = Summarizer(config, hp.lm)

    # tune the classification threshold
    threshold = tune_threshold(config, model, hp)

    # run prediction
    predict(hp.input_path, hp.output_path, config, model,
            summarizer=summarizer,
            max_len=hp.max_len,
            lm=hp.lm,
            dk_injector=dk_injector,
            threshold=threshold)

    # TODO: compute and return the indicators; the callers (ml_er below and
    # Optimization.train in the HPO script) expect a dict with 'F1',
    # 'interpretability', and 'performance' keys
    # TODO: write results
    # TODO: interpretability


# TODO: ml_er function
def ml_er(config: Configuration):
    indicators = matching(config)
    output_path = os.path.join(er_output_dir, "eval_result.txt")
    with open(output_path, 'w') as _f:
        _f.write('F1:' + str(indicators["F1"]) + '\n')
        _f.write('interpretability:' + str(indicators['interpretability']) + '\n')
        _f.write('performance:' + str(indicators['performance']) + '\n')


if __name__ == '__main__':
    if os.path.isfile(os.path.join(hpo_output_dir, "incumbent.json")):
        with open(os.path.join(hpo_output_dir, "configspace.json"), 'r') as f:
            dict_configspace = json.load(f)
        str_configspace = json.dumps(dict_configspace)
        configspace = csj.read(str_configspace)
        with open(os.path.join(hpo_output_dir, "incumbent.json"), 'r') as f:
            dic = json.load(f)
        configuration = ConfigSpace.Configuration(configspace, values=dic)
        ml_er(configuration)
@ -0,0 +1,12 @@

from sentence_transformers import SentenceTransformer

directory_path = '/root/hjt/DeepMatcher Dataset/Structured/Amazon-Google'

er_output_dir = '/root/hjt/md_bayesian_er_ditto/ml_er/output'
md_output_dir = '/root/hjt/md_bayesian_er_ditto/md_discovery/output'
hpo_output_dir = '/root/hjt/md_bayesian_er_ditto/hpo/output'

# model = SentenceTransformer('/root/hjt/all-MiniLM-L6-v2')
interpre_weight = 0  # interpretability weight
support_threshold = 1
confidence_threshold = 0.75
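
# Assumption: interpre_weight blends interpretability into the overall
# 'performance' indicator; with a weight of 0, interpretability is ignored.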