parent
b21b0aa496
commit
9b06ce3840
@ -1,73 +0,0 @@
|
|||||||
import json
|
|
||||||
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
|
|
||||||
from ConfigSpace.conditions import InCondition
|
|
||||||
from ConfigSpace.read_and_write import json as csj
|
|
||||||
import py_entitymatching.catalog.catalog_manager as cm
|
|
||||||
import pandas as pd
|
|
||||||
from smac import HyperparameterOptimizationFacade, Scenario
|
|
||||||
from settings import *
|
|
||||||
from ml_er.ml_entity_resolver import er_process
|
|
||||||
|
|
||||||
|
|
||||||
class Classifier:
    """Target-function wrapper handed to SMAC for the ML entity-resolution run."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the search space.

        Only the ``ml_matcher`` choice is registered; every access builds a
        fresh ``ConfigurationSpace`` seeded with 0.
        """
        matcher_choice = Categorical(
            "ml_matcher",
            ["dt", "svm", "rf", "lg", "ln", "nb"],
            default="rf",
        )
        # TODO: hyperparameters for each classifier.
        # Placeholder only: this hyperparameter is created but never added to
        # the space below.
        criterion_placeholder = Categorical(
            "dt_criterion",
            ["gini", "entropy", "log_loss"],
            default="gini",
        )

        space = ConfigurationSpace(seed=0)
        space.add_hyperparameters([matcher_choice])
        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Evaluate one configuration and return its cost.

        Args:
            config: Configuration forwarded unchanged to ``er_process``.
            seed: Accepted but not used by this method.

        Returns:
            ``1 - indicators['performance']``, so a higher performance yields
            a lower cost.
        """
        # The py_entitymatching catalog is dropped before every evaluation.
        cm.del_catalog()
        outcome = er_process(config)
        performance = outcome['performance']
        return 1 - performance
|
|
||||||
|
|
||||||
|
|
||||||
def ml_er_hpo():
    """Run the SMAC search over ``Classifier`` and persist the outcome.

    Writes ``configspace.json`` and ``incumbent.json`` using the
    ``hpo_output_dir`` prefix, prints the default and incumbent costs, and
    returns the configuration that was finally selected.
    """
    target = Classifier()
    space = target.configspace

    # Round-trip the serialized space through json so it is stored indented.
    space_as_dict = json.loads(csj.write(space))
    with open(hpo_output_dir + "configspace.json", "w") as out:
        json.dump(space_as_dict, out, indent=4)

    scenario = Scenario(
        space,
        deterministic=True,
        n_trials=12,  # Trial budget (combinations of config and seed).
        n_workers=1,
    )
    warmup = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    facade = HyperparameterOptimizationFacade(
        scenario,
        target.train,
        initial_design=warmup,
        overwrite=True,  # Overwrite an existing run rather than resuming it.
    )

    best = facade.optimize()
    best_cost = facade.validate(best)
    baseline = space.get_default_configuration()
    baseline_cost = facade.validate(baseline)
    print(f"Default Cost: {baseline_cost}")
    print(f"Incumbent Cost: {best_cost}")

    # Fall back to the default configuration when the search did worse.
    if best_cost > baseline_cost:
        best = baseline
        print(f"Updated Incumbent Cost: {baseline_cost}")

    print(f"Optimized Configuration:{best.values()}")

    with open(hpo_output_dir + "incumbent.json", "w") as out:
        json.dump(dict(best), out, indent=4)
    return best
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: start the hyperparameter search only on direct execution.
if __name__ == "__main__":
    ml_er_hpo()
|
|
@ -0,0 +1,110 @@
|
|||||||
|
import json
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
|
||||||
|
from ConfigSpace.conditions import InCondition, EqualsCondition
|
||||||
|
from ConfigSpace.read_and_write import json as csj
|
||||||
|
import py_entitymatching.catalog.catalog_manager as cm
|
||||||
|
import pandas as pd
|
||||||
|
from smac import HyperparameterOptimizationFacade, Scenario
|
||||||
|
|
||||||
|
from ml_er.magellan_new import matching
|
||||||
|
from settings import *
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
    """Target-function wrapper handed to SMAC for tuning the matcher."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the conditional search space for the dt / svm / rf matchers.

        Every non-root hyperparameter is activated only for the matcher(s) it
        is conditioned on. Every access builds a fresh space seeded with 0.
        """
        matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])

        # Hyperparameters prefixed with "tree" are shared by DT and RF.
        criterion = Categorical(
            "tree_criterion", ["gini", "entropy", "log_loss"], default="gini"
        )
        max_depth = Integer('tree_max_depth', (15, 30), default=None)

        # Random-forest-only hyperparameters.
        # NOTE(review): the key 'number_of_tree' does not follow the rf_ prefix
        # used elsewhere; presumably consumers look it up by this exact name.
        n_trees = Integer('number_of_tree', (10, 150))
        # NOTE(review): recent scikit-learn releases no longer accept 'auto'
        # for max_features - confirm against the installed version.
        forest_max_features = Categorical(
            'rf_max_features', ["sqrt", "log2", "auto"], default='sqrt'
        )

        # SVM-only hyperparameters.
        # NOTE(review): if this value is forwarded to scikit-learn's SVC,
        # 'precomputed' expects a square kernel matrix rather than feature
        # vectors - confirm matching() copes with it.
        kernel = Categorical(
            'svm_kernel',
            ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            default='rbf',
        )
        penalty = Integer('svm_C', (1, 100), default=1)
        gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
        degree = Integer('svm_degree', (1, 5), default=3)
        constant = Float('svm_constant', (0.0, 5.0), default=0.0)

        # Decision-tree-only hyperparameters.
        splitter = Categorical('dt_splitter', ["best", "random"], default='best')
        tree_max_features = Categorical(
            'dt_max_features', ["auto", "sqrt", "log2"], default=None
        )

        space = ConfigurationSpace(seed=0)
        space.add_hyperparameters([
            matcher,
            criterion,
            n_trees,
            max_depth,
            forest_max_features,
            kernel,
            penalty,
            gamma,
            degree,
            constant,
            splitter,
            tree_max_features,
        ])

        # Conditions are registered in the same order as before: SVM first,
        # then the dt/rf-specific ones, then the shared tree ones.
        svm_only = [penalty, constant, degree, gamma, kernel]
        activation_rules = [
            EqualsCondition(child=hp, parent=matcher, value="svm")
            for hp in svm_only
        ]
        activation_rules += [
            EqualsCondition(child=splitter, parent=matcher, value="dt"),
            EqualsCondition(child=n_trees, parent=matcher, value="rf"),
            EqualsCondition(child=tree_max_features, parent=matcher, value="dt"),
            EqualsCondition(child=forest_max_features, parent=matcher, value="rf"),
            InCondition(child=max_depth, parent=matcher, values=['dt', 'rf']),
            InCondition(child=criterion, parent=matcher, values=['dt', 'rf']),
        ]
        space.add_conditions(activation_rules)

        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Evaluate one configuration on the stored blocking result.

        Args:
            config: Configuration forwarded unchanged to ``matching``.
            seed: Accepted but not used by this method.

        Returns:
            ``1 - indicators['performance']``, so a higher performance yields
            a lower cost.
        """
        # The py_entitymatching catalog is dropped before every evaluation.
        cm.del_catalog()

        # The blocking result is re-read from disk on every call.
        # NOTE(review): pickle.load can execute code embedded in the file -
        # only use it on files produced by this project.
        with open(er_output_dir + "blocking_result.pickle", "rb") as handle:
            candidates = pickle.load(handle)

        scores = matching(config, candidates)
        performance = scores['performance']
        return 1 - performance
|
||||||
|
|
||||||
|
|
||||||
|
def ml_er_hpo():
    """Run the SMAC search over ``Classifier`` and persist the outcome.

    Writes ``configspace.json`` and ``incumbent.json`` using the
    ``hpo_output_dir`` prefix, prints the default and incumbent costs, and
    returns the configuration that was finally selected.
    """
    target = Classifier()
    space = target.configspace

    # Save the hyperparameter space locally; the json round-trip lets it be
    # stored with indentation.
    space_as_dict = json.loads(csj.write(space))
    with open(hpo_output_dir + "configspace.json", "w") as out:
        json.dump(space_as_dict, out, indent=4)

    scenario = Scenario(
        space,
        crash_cost=1.0,
        deterministic=True,
        n_trials=50,
        n_workers=1,
    )
    warmup = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    facade = HyperparameterOptimizationFacade(
        scenario,
        target.train,
        initial_design=warmup,
        overwrite=True,  # Overwrite an existing run rather than resuming it.
    )

    best = facade.optimize()
    best_cost = facade.validate(best)
    baseline = space.get_default_configuration()
    baseline_cost = facade.validate(baseline)
    print(f"Default Cost: {baseline_cost}")
    print(f"Incumbent Cost: {best_cost}")

    # Fall back to the default configuration when the search did worse.
    if best_cost > baseline_cost:
        best = baseline
        print(f"Updated Incumbent Cost: {baseline_cost}")

    print(f"Optimized Configuration:{best.values()}")

    with open(hpo_output_dir + "incumbent.json", "w") as out:
        json.dump(dict(best), out, indent=4)
    return best
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: start the hyperparameter search only on direct execution.
if __name__ == "__main__":
    ml_er_hpo()
|
@ -0,0 +1,4 @@
|
|||||||
|
from ml_er.magellan_new import blocking_mining
|
||||||
|
|
||||||
|
# Script entry point: call blocking_mining only on direct execution.
if __name__ == "__main__":
    blocking_mining()
|
Loading…
Reference in new issue