import pandas as pd
import json

from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace import EqualsCondition
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario

from settings import *
from ml_er.ml_entity_resolver import er_process
from settings import ltable_path, ltable_id


class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
        cs = ConfigurationSpace(seed=0)

        # Candidate blocking attributes: every column of the left table except its id column.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
        selected_attrs = ltable.columns.values.tolist()
        block_attr_items = selected_attrs[:]
        block_attr_items.remove(ltable_id)

        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        qgrams = Integer('qgrams', (3, 10), default=6)
        # qgrams is only active when the QGrams blocker is selected.
        use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")

        block_attr = Categorical("block_attr", block_attr_items)
        block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)

        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ',
                                        'SNJ', 'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])

        # todo: other hyperparameters
        matching_metric = Categorical("matching_metric", ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer", ['tfidf', 'tf', 'boolean'])
        matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))

        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme,
                                matching_metric, matching_tokenizer, matching_vectorizer,
                                clusteror, qgrams, block_filtering_ratio, matching_sim_thresh])
        cs.add_conditions([use_qgrams])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes cost, so return 1 - performance.
        indicators = er_process(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    classifier = Classifier()
    cs = classifier.configspace

    # Persist the configuration space alongside the HPO output.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    with open(hpo_output_dir + r"\configspace.json", "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1,
        deterministic=True,
        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()

    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if the incumbent did not improve on it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {incumbent.values()}")

    with open(hpo_output_dir + r"\incumbent.json", "w") as f:
        json.dump(dict(incumbent), f, indent=4)

    return incumbent


if __name__ == '__main__':
    ml_er_hpo()
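

# --- Illustrative sketch (assumption, not part of the original pipeline) -----
# A minimal example of how the artifacts written by ml_er_hpo() might be loaded
# back, e.g. to re-run er_process with the tuned configuration. The helper name
# load_incumbent_config is hypothetical; the paths mirror the files written above.
def load_incumbent_config() -> Configuration:
    # Rebuild the ConfigurationSpace from its JSON serialization.
    with open(hpo_output_dir + r"\configspace.json", "r") as f:
        cs = csj.read(f.read())
    # Read the tuned hyperparameter values and wrap them in a Configuration.
    with open(hpo_output_dir + r"\incumbent.json", "r") as f:
        values = json.load(f)
    return Configuration(cs, values=values)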