matching_dependency/hpo/magellan_hpo.py

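"""SMAC-based hyperparameter optimization for Magellan (py_entitymatching) matchers.

Builds a conditional search space over DT / SVM / RF matchers, evaluates each
sampled configuration on the cached blocking result via ml_er.magellan_new.matching,
and writes both the search space and the selected configuration to hpo_output_dir.
"""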
import json
import pickle
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from ml_er.magellan_new import matching
from settings import *


class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
        cs = ConfigurationSpace(seed=0)

        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])
        # note: hyperparameters prefixed with "tree" are shared by DT and RF
        tree_criterion = Categorical("tree_criterion", ["gini", "entropy", "log_loss"], default="gini")
        rf_n_estimators = Integer('number_of_tree', (10, 150))
        tree_max_depth = Integer('tree_max_depth', (15, 30), default=None)
        rf_max_features = Categorical('rf_max_features', ["sqrt", "log2", "auto"], default='sqrt')
        svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
        svm_C = Integer('svm_C', (1, 100), default=1)
        svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
        svm_degree = Integer('svm_degree', (1, 3), default=3)
        svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
        dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
        dt_max_features = Categorical('dt_max_features', ["auto", "sqrt", "log2"], default=None)
        cs.add_hyperparameters([ml_matcher, tree_criterion, rf_n_estimators, tree_max_depth, rf_max_features,
                                svm_kernel, svm_C, svm_gamma, svm_degree, svm_constant, dt_splitter, dt_max_features])

        # activate each hyperparameter only for the matcher(s) it belongs to
        active_tree_criterion = InCondition(child=tree_criterion, parent=ml_matcher, values=['dt', 'rf'])
        active_tree_max_depth = InCondition(child=tree_max_depth, parent=ml_matcher, values=['dt', 'rf'])
        active_rf_n_estimators = EqualsCondition(child=rf_n_estimators, parent=ml_matcher, value="rf")
        active_rf_max_features = EqualsCondition(child=rf_max_features, parent=ml_matcher, value="rf")
        active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
        active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
        active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
        active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
        # svm_gamma / svm_degree / svm_constant are only meaningful for certain kernels, so each needs
        # both the "matcher is svm" condition and a kernel condition, combined via AndConjunction
        active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
        active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
        active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
        active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
        active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
        active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
        cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
                           active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
                           AndConjunction(active_svm_gamma1, active_svm_gamma2),
                           AndConjunction(active_svm_degree1, active_svm_degree2),
                           AndConjunction(active_svm_constant1, active_svm_constant2)])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        cm.del_catalog()
        with open(er_output_dir + "blocking_result.pickle", "rb") as file:
            blocking_result = pickle.load(file)
        indicators = matching(config, blocking_result)
        # SMAC minimizes the returned cost, so report 1 - performance
        return 1 - indicators['performance']


def ml_er_hpo():
    classifier = Classifier()
    cs = classifier.configspace
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)

    # save the hyperparameter space to disk
    with open(hpo_output_dir + "configspace.json", "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=20,
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # fall back to the default configuration if the tuned one is not better
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {incumbent.values()}")
    with open(hpo_output_dir + "incumbent.json", "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    ml_er_hpo()