You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
84 lines
3.1 KiB
84 lines
3.1 KiB
import json
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
|
|
from ConfigSpace.conditions import InCondition
|
|
from ConfigSpace.read_and_write import json as csj
|
|
import py_entitymatching.catalog.catalog_manager as cm
|
|
import pandas as pd
|
|
from smac import HyperparameterOptimizationFacade, Scenario
|
|
from settings import *
|
|
from ml_er.ml_entity_resolver import er_process
|
|
|
|
|
|
class Classifier:
    """Target model handed to SMAC: exposes the search space and the cost function."""

    @property
    def configspace(self) -> ConfigurationSpace:
        """Assemble the hyperparameter search space for the ER pipeline.

        The candidate blocking attributes are the column names of the CSV at
        ``ltable_path`` with the ``ltable_id`` column removed. Both names are
        not defined in this block (presumably supplied by ``settings``).
        Because this is a property, the CSV is read again on every access.

        Returns:
            A ``ConfigurationSpace`` (seed 0) holding the six hyperparameters.

        Raises:
            ValueError: if ``ltable_id`` is not one of the CSV's columns
                (raised by ``list.remove``).
        """
        space = ConfigurationSpace(seed=0)

        # Every column except the id column may be used as the blocking attribute.
        left_table = pd.read_csv(ltable_path, encoding='ISO-8859-1')
        candidate_attrs = left_table.columns.values.tolist()
        candidate_attrs.remove(ltable_id)

        # Parameters and their ranges; the order matches the original registration.
        hyperparameters = [
            Categorical("block_attr", candidate_attrs),
            Categorical(
                "ml_matcher",
                ["dt", "svm", "rf", "lg", "ln", "nb"],
                default="rf",
            ),
            Categorical(
                "ml_blocker",
                ["over_lap", "attr_equiv"],
                default="over_lap",
            ),
            Float("similarity_thresh", (0, 0.2), default=0.2),
            Integer("support_thresh", (1, 5), default=1),
            Float("confidence_thresh", (0.25, 0.5), default=0.25),
        ]
        space.add_hyperparameters(hyperparameters)
        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Run the ER process for ``config`` and return its cost.

        Args:
            config: Configuration to evaluate; passed unchanged to ``er_process``.
            seed: Accepted as part of the signature but not used here.

        Returns:
            ``1 - indicators['performance']``, so a higher performance value
            yields a lower cost.
        """
        # Drop the py_entitymatching catalog before each evaluation.
        cm.del_catalog()
        performance = er_process(config)['performance']
        return 1 - performance
|
|
|
|
|
|
def ml_er_hpo():
    """Optimise the ER pipeline's hyperparameters with SMAC and save the result.

    Steps:
      1. Write the search space to ``hpo_output_dir + "configspace.json"``.
      2. Run SMAC's hyperparameter-optimisation facade on ``Classifier.train``.
      3. Validate both the incumbent and the default configuration, and fall
         back to the default when the incumbent's cost is higher.
      4. Write the chosen configuration to ``hpo_output_dir + "incumbent.json"``.

    ``hpo_output_dir`` is not defined in this block (presumably supplied by
    ``settings``). It is concatenated as-is, so it is expected to end with a
    path separator.

    Returns:
        The selected configuration (the incumbent, or the default configuration
        if that validated with a lower cost).
    """
    facade_cls = HyperparameterOptimizationFacade
    classifier = Classifier()
    cs = classifier.configspace

    # Serialise first, then open the file, so a serialisation error leaves no
    # empty file behind.
    configspace_as_dict = json.loads(csj.write(cs))
    with open(hpo_output_dir + "configspace.json", "w") as space_file:
        json.dump(configspace_as_dict, space_file, indent=4)

    scenario = Scenario(
        cs,
        deterministic=True,
        # At most 50 trials (a trial is a combination of config and seed).
        n_trials=50,
        # Wall-clock budget in seconds: 28800 s = 8 h.
        walltime_limit=28800,
        n_workers=1,
    )

    # Initial design with 5 configurations.
    initial_design = facade_cls.get_initial_design(scenario, n_configs=5)

    smac = facade_cls(
        scenario,
        classifier.train,
        initial_design=initial_design,
        # Overwrite an existing run instead of continuing from its last state.
        overwrite=True,
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)

    default = cs.get_default_configuration()
    default_cost = smac.validate(default)

    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Keep the default configuration if the optimiser ended up worse than it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")

    with open(hpo_output_dir + "incumbent.json", "w") as result_file:
        json.dump(dict(incumbent), result_file, indent=4)

    return incumbent
|
|
|
|
|
|
# Start the optimisation only when this file is executed as a script.
if __name__ == "__main__":
    ml_er_hpo()
|