# Hyperparameter optimisation (HPO) driver for ML-based entity resolution:
# builds the SMAC configuration space and runs the optimisation loop.
import json
import os

import pandas as pd
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace import EqualsCondition
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario

from settings import *
from settings import ltable_path, ltable_id
from ml_er.ml_entity_resolver import er_process
class Classifier:
    """Adapts the entity-resolution pipeline to SMAC's target-function API.

    Exposes the hyperparameter search space through ``configspace`` and the
    optimisation objective through ``train`` (SMAC minimises cost, so the
    objective is ``1 - performance``).
    """

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the search space for blocking, matching and clustering.

        Candidate blocking attributes come from the left table's header
        (the id column is excluded). Only the CSV header row is read —
        the table data itself is not needed here.
        """
        cs = ConfigurationSpace(seed=0)

        # nrows=0 parses just the header: we only need the column names.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|', nrows=0)
        block_attr_items = ltable.columns.values.tolist()
        block_attr_items.remove(ltable_id)

        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        # q-gram size is only meaningful when the QGrams blocker is chosen
        # (enforced by the EqualsCondition below).
        qgrams = Integer('qgrams', (3, 10), default=6)
        use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")
        block_attr = Categorical("block_attr", block_attr_items)
        block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)
        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])
        # TODO: expose further pipeline hyperparameters here.
        matching_metric = Categorical("matching_metric",
                                      ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer",
                                          ['tfidf', 'tf', 'boolean'])
        matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))
        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
                                matching_tokenizer, matching_vectorizer, clusteror, qgrams, block_filtering_ratio,
                                matching_sim_thresh])
        cs.add_conditions([use_qgrams])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Run the ER pipeline for *config* and return its cost (1 - performance)."""
        indicators = er_process(config)
        return 1 - indicators['performance']
def ml_er_hpo():
    """Run SMAC hyperparameter optimisation for the entity-resolution pipeline.

    Persists the configuration space (``configspace.json``) and the winning
    configuration (``incumbent.json``) under ``hpo_output_dir``, and returns
    the incumbent Configuration.
    """
    classifier = Classifier()
    cs = classifier.configspace

    # Serialise the search space so downstream tooling can reload it.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # os.path.join keeps the path portable (the previous hard-coded "\\"
    # only worked on Windows).
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1,
        deterministic=True,
        n_trials=50,  # at most 50 (config, seed) evaluations
        n_workers=1,
    )
    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # discard any previous run state instead of resuming
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if SMAC failed to beat it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")
    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent
# Script entry point: run the HPO loop when executed directly.
if __name__ == "__main__":
    ml_er_hpo()