You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
matching_dependency/hpo/magellan_hpo.py

118 lines
5.3 KiB

11 months ago
import json
import pickle
import time
11 months ago
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
10 months ago
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
11 months ago
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from colorama import Fore, init
from smac import HyperparameterOptimizationFacade, Scenario, BlackBoxFacade
11 months ago
9 months ago
from ml_er.magellan_er import matching
11 months ago
from settings import *
9 months ago
class Optimization:
    """Search space and objective function for tuning the Magellan matcher.

    ``configspace`` describes the tunable hyperparameters of the three
    candidate matchers ("dt", "svm", "rf"); ``train`` evaluates a single
    configuration and turns its performance into a cost.
    """

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the conditional hyperparameter space.

        A fresh ``ConfigurationSpace`` (seed 0) is constructed on every access.
        Each hyperparameter is only active for the matcher (and, for some SVM
        parameters, the kernel) it applies to.

        Returns:
            The configuration space with all hyperparameters and activation
            conditions registered.
        """
        space = ConfigurationSpace(seed=0)

        matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])

        # NOTE: hyperparameters whose name starts with "tree" are shared by DT and RF.
        criterion = Categorical("tree_criterion", ["gini", "entropy", "log_loss"], default="gini")
        n_trees = Integer("number_of_tree", (10, 150))
        max_depth = Integer("tree_max_depth", (15, 30), default=None)
        forest_max_features = Categorical("rf_max_features", ["sqrt", "log2", "auto"], default="sqrt")

        # NOTE(review): "precomputed" is part of the kernel choices -- confirm
        # that the matcher called from ``train`` can actually handle it.
        kernel = Categorical(
            "svm_kernel", ["linear", "poly", "rbf", "sigmoid", "precomputed"], default="rbf"
        )
        penalty_c = Integer("svm_C", (1, 100), default=1)
        gamma = Categorical("svm_gamma", ["scale", "auto"], default="scale")
        degree = Integer("svm_degree", (1, 3), default=3)
        constant = Float("svm_constant", (0.0, 5.0), default=0.0)

        splitter = Categorical("dt_splitter", ["best", "random"], default="best")
        tree_max_features = Categorical("dt_max_features", ["auto", "sqrt", "log2"], default=None)

        # Registration order is kept stable on purpose.
        space.add_hyperparameters([
            matcher, criterion, n_trees, max_depth, forest_max_features,
            kernel, penalty_c, gamma, degree, constant, splitter, tree_max_features,
        ])

        def for_matcher(child, name):
            """Activate *child* only when ``ml_matcher`` equals *name*."""
            return EqualsCondition(child=child, parent=matcher, value=name)

        def for_tree_models(child):
            """Activate *child* for both tree-based matchers (DT and RF)."""
            return InCondition(child=child, parent=matcher, values=["dt", "rf"])

        space.add_conditions([
            for_matcher(penalty_c, "svm"),
            for_matcher(kernel, "svm"),
            for_matcher(splitter, "dt"),
            for_matcher(n_trees, "rf"),
            for_matcher(tree_max_features, "dt"),
            for_matcher(forest_max_features, "rf"),
            for_tree_models(max_depth),
            for_tree_models(criterion),
            # Kernel-specific SVM parameters need the SVM matcher AND a kernel
            # from the listed set.
            AndConjunction(
                for_matcher(gamma, "svm"),
                InCondition(child=gamma, parent=kernel, values=["rbf", "poly", "sigmoid"]),
            ),
            AndConjunction(
                for_matcher(degree, "svm"),
                EqualsCondition(child=degree, parent=kernel, value="poly"),
            ),
            AndConjunction(
                for_matcher(constant, "svm"),
                InCondition(child=constant, parent=kernel, values=["poly", "sigmoid"]),
            ),
        ])
        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Evaluate one configuration and return its cost.

        Args:
            config: The hyperparameter configuration to evaluate.
            seed: Not used by this implementation; presumably kept because the
                optimizer calls the target function with a seed argument.

        Returns:
            ``1 - performance``, where ``performance`` is read from the
            indicators returned by ``matching(config)``.
        """
        # Drop the py_entitymatching catalog before every evaluation.
        cm.del_catalog()
        performance = matching(config)['performance']
        return 1 - performance
def ml_er_hpo():
    """Run SMAC hyperparameter optimization for the matcher and save the results.

    The configuration space is written to ``configspace.json`` and the selected
    configuration to ``incumbent.json``, both inside ``hpo_output_dir``
    (presumably provided by ``from settings import *``).

    After optimization, the incumbent and the default configuration are both
    validated; if the default has a strictly lower cost, it replaces the
    incumbent.

    Returns:
        The selected configuration (SMAC's incumbent or the default
        configuration).
    """
    # Function-scope import: output paths are built with os.path.join instead of
    # a hard-coded "\" so they also resolve correctly on non-Windows systems.
    import os

    optimization = Optimization()
    cs = optimization.configspace

    # Save the hyperparameter space locally.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    configspace_path = os.path.join(hpo_output_dir, "configspace.json")
    with open(configspace_path, "w", encoding="utf-8") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=16,
        n_workers=1
    )

    initial_design = BlackBoxFacade.get_initial_design(scenario, n_configs=5)

    smac = BlackBoxFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(Fore.BLUE + f"Default Cost: {default_cost}")
    print(Fore.BLUE + f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration when the search did not beat it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(Fore.RED + f'Updated Incumbent Cost: {default_cost}')

    print(Fore.BLUE + f"Optimized Configuration:{incumbent.values()}")

    incumbent_path = os.path.join(hpo_output_dir, "incumbent.json")
    with open(incumbent_path, "w", encoding="utf-8") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent
if __name__ == "__main__":
    # Script entry point: enable colorama with automatic style reset, print
    # the start timestamp (seconds since the epoch), then run the optimization.
    init(autoreset=True)
    start_message = f'Start Time: {time.time()}'
    print(Fore.CYAN + start_message)
    ml_er_hpo()