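"""
SMAC-based hyperparameter optimization (HPO) for the ML entity-resolution pipeline.

Builds a ConfigSpace covering blocking, meta-blocking, matching and clustering
choices, then searches for the configuration that minimizes 1 - performance as
reported by er_process.
"""
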
import json
import os

import pandas as pd
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, EqualsCondition, Float, Integer
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario

from ml_er.ml_entity_resolver import er_process
from settings import hpo_output_dir, ltable_id, ltable_path


class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
        """Search space over the blocking, meta-blocking, matching and clustering stages."""
        cs = ConfigurationSpace(seed=0)

        # Candidate blocking attributes: every column of the left table except its id.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
        selected_attrs = ltable.columns.values.tolist()
        block_attr_items = selected_attrs[:]
        block_attr_items.remove(ltable_id)

        # Blocking: which blocker to use; the q-gram size is only active for the QGrams blocker.
        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        qgrams = Integer("qgrams", (3, 10), default=6)
        use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")
        block_attr = Categorical("block_attr", block_attr_items)
        block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)

        # Meta-blocking: pruning algorithm and edge-weighting scheme.
        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])

        # TODO: other hyperparameters
        # Matching: similarity metric, tokenizer, vectorizer and similarity threshold.
        matching_metric = Categorical("matching_metric", ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer", ['tfidf', 'tf', 'boolean'])
        matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))

        # Clustering: algorithm used to turn the similarity graph into entity clusters.
        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
                                matching_tokenizer, matching_vectorizer, clusteror, qgrams, block_filtering_ratio,
                                matching_sim_thresh])
        cs.add_conditions([use_qgrams])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes cost, so return 1 - performance of the ER run with this config.
        indicators = er_process(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    classifier = Classifier()
    cs = classifier.configspace

    # Persist the configuration space so the chosen incumbent can be reloaded later.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1,
        deterministic=True,
        n_trials=50,  # run at most 50 trials (combinations of config and seed)
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # if a previous run exists, overwrite it instead of continuing from its last state
    )

    incumbent = smac.optimize()

    # Compare the incumbent against the default configuration and keep whichever is cheaper.
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {dict(incumbent)}")
    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    ml_er_hpo()