import json
import pickle

from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from ml_er.magellan_new import matching
from settings import *


class Classifier:
    """Hyperparameter search space and SMAC target function for the ML matcher."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the conditional search space over the dt / svm / rf matchers.

        Returns:
            A freshly constructed ``ConfigurationSpace`` (seed 0) holding every
            hyperparameter together with the conditions that make each one
            active only for the matcher (and SVM kernel) it is attached to.
        """
        space = ConfigurationSpace(seed=0)

        matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])

        # NOTE: hyperparameters whose name starts with "tree" are shared by
        # the decision tree (DT) and the random forest (RF).
        criterion = Categorical("tree_criterion", ["gini", "entropy", "log_loss"], default="gini")
        n_trees = Integer('number_of_tree', (10, 150))
        max_depth = Integer('tree_max_depth', (15, 30), default=None)
        rf_features = Categorical('rf_max_features', ["sqrt", "log2", "auto"], default='sqrt')

        kernel = Categorical(
            'svm_kernel',
            ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            default='rbf',
        )
        svm_c = Integer('svm_C', (1, 100), default=1)
        gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
        degree = Integer('svm_degree', (1, 3), default=3)
        constant = Float('svm_constant', (0.0, 5.0), default=0.0)

        splitter = Categorical('dt_splitter', ["best", "random"], default='best')
        dt_features = Categorical('dt_max_features', ["auto", "sqrt", "log2"], default=None)

        space.add_hyperparameters([
            matcher,
            criterion,
            n_trees,
            max_depth,
            rf_features,
            kernel,
            svm_c,
            gamma,
            degree,
            constant,
            splitter,
            dt_features,
        ])

        def on_matcher(child, name):
            """Return a condition activating *child* when ml_matcher equals *name*."""
            return EqualsCondition(child=child, parent=matcher, value=name)

        # The registration order below is kept identical to the original list.
        space.add_conditions([
            on_matcher(svm_c, "svm"),
            on_matcher(kernel, "svm"),
            on_matcher(splitter, "dt"),
            on_matcher(n_trees, "rf"),
            on_matcher(dt_features, "dt"),
            on_matcher(rf_features, "rf"),
            InCondition(child=max_depth, parent=matcher, values=['dt', 'rf']),
            InCondition(child=criterion, parent=matcher, values=['dt', 'rf']),
            # The kernel-specific SVM parameters need both the svm matcher and
            # one of the listed kernels to be selected.
            AndConjunction(
                on_matcher(gamma, "svm"),
                InCondition(child=gamma, parent=kernel, values=["rbf", "poly", "sigmoid"]),
            ),
            AndConjunction(
                on_matcher(degree, "svm"),
                EqualsCondition(child=degree, parent=kernel, value="poly"),
            ),
            AndConjunction(
                on_matcher(constant, "svm"),
                InCondition(child=constant, parent=kernel, values=["poly", "sigmoid"]),
            ),
        ])
        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Evaluate one configuration and return its cost.

        Args:
            config: Configuration handed to ``matching``.
            seed: Accepted as part of the signature; not used in this method.

        Returns:
            ``1 - indicators['performance']``, where ``indicators`` is the
            value returned by ``matching``.
        """
        # Clear the py_entitymatching catalog before every evaluation.
        cm.del_catalog()

        pickle_path = er_output_dir + "blocking_result.pickle"
        with open(pickle_path, "rb") as fh:
            candidates = pickle.load(fh)

        scores = matching(config, candidates)
        return 1 - scores['performance']


def ml_er_hpo():
    """Run the SMAC search and persist the search space and chosen configuration.

    Writes ``configspace.json`` and ``incumbent.json`` under
    ``hpo_output_dir``. If the validated cost of SMAC's incumbent is higher
    than that of the default configuration, the default configuration is used
    instead.

    Returns:
        The configuration that was written to ``incumbent.json``.
    """
    classifier = Classifier()
    cs = classifier.configspace

    # Save the hyperparameter space locally.
    space_as_dict = json.loads(csj.write(cs))
    with open(hpo_output_dir + "configspace.json", "w") as out:
        json.dump(space_as_dict, out, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=20,
        n_workers=1,
    )
    initial_design = HyperparameterOptimizationFacade.get_initial_design(
        scenario,
        n_configs=5,
    )
    optimizer = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        # If the run exists, we overwrite it; alternatively, we could
        # continue from the last state.
        overwrite=True,
    )

    incumbent = optimizer.optimize()
    incumbent_cost = optimizer.validate(incumbent)

    default = cs.get_default_configuration()
    default_cost = optimizer.validate(default)

    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration when the search did worse.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")

    with open(hpo_output_dir + "incumbent.json", "w") as out:
        json.dump(dict(incumbent), out, indent=4)

    return incumbent


if __name__ == '__main__':
    ml_er_hpo()