"""Hyperparameter optimization of the ML entity-resolution pipeline with SMAC."""
import json
import os

from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition
from ConfigSpace.read_and_write import json as csj
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario

from settings import *
from ml_er.ml_entity_resolver import er_process


class Classifier:
    """Search space and objective function handed to SMAC."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the configuration space of the entity-resolution pipeline.

        The space holds three categorical hyperparameters:

        * ``block_attr`` -- one of the left table's columns, excluding the
          id column ``ltable_id``.
        * ``ml_matcher`` -- matcher identifier, default ``"rf"``.
        * ``ml_blocker`` -- blocker identifier, default ``"over_lap"``.

        Returns:
            The populated ``ConfigurationSpace`` (seeded with 0).

        Raises:
            ValueError: If ``ltable_id`` is not a column of the left table.
        """
        cs = ConfigurationSpace(seed=0)

        # Only the header is needed, so skip parsing the data rows.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', nrows=0)
        columns = ltable.columns.tolist()
        if ltable_id not in columns:
            raise ValueError(
                f"id column {ltable_id!r} not found in {ltable_path!r}"
            )
        # Mirror list.remove(): drop only the first occurrence of the id column.
        id_position = columns.index(ltable_id)
        block_attr_items = columns[:id_position] + columns[id_position + 1:]

        block_attr = Categorical("block_attr", block_attr_items)
        ml_matcher = Categorical(
            "ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf"
        )
        ml_blocker = Categorical(
            "ml_blocker", ["over_lap", "attr_equiv"], default="over_lap"
        )
        cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Run the pipeline for ``config`` and return its cost.

        Args:
            config: Configuration sampled from ``configspace``.
            seed: Part of the target-function signature SMAC calls; not used
                by this implementation.

        Returns:
            ``1 - indicators['performance']``, so that a higher performance
            yields a lower cost for the optimizer to minimize.
        """
        # Reset the py_entitymatching catalog before every evaluation.
        cm.del_catalog()
        indicators = er_process(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    """Optimize the pipeline configuration and persist the results.

    Writes ``configspace.json`` and ``incumbent.json`` into
    ``hpo_output_dir``, prints the default and incumbent costs, and falls
    back to the default configuration when the incumbent is worse.

    Returns:
        The selected ``Configuration`` (the incumbent, or the default
        configuration if it has the lower cost).
    """
    classifier = Classifier()
    cs = classifier.configspace

    # Round-trip through json so the file is written pretty-printed.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # os.path.join keeps the path valid whether or not hpo_output_dir ends
    # with a separator.
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        deterministic=True,
        n_trials=12,  # We want to run max 12 trials (combination of config and seed)
        n_workers=1,
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(
        scenario, n_configs=5
    )

    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)

    default = cs.get_default_configuration()
    default_cost = smac.validate(default)

    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Never report a configuration that is worse than the default one.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)

    return incumbent


if __name__ == '__main__':
    ml_er_hpo()