|
|
|
import json
import os

import pandas as pd
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace import EqualsCondition
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario

from settings import *
from settings import ltable_path, ltable_id
from ml_er.ml_entity_resolver import er_process
|
|
|
|
|
|
|
|
|
|
|
|
class Classifier:
    """Adapter that exposes the ER pipeline as a SMAC optimization target.

    Provides the hyperparameter search space (``configspace``) and the
    objective function (``train``) that SMAC minimizes.
    """

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build (and cache) the hyperparameter search space.

        Candidate blocking attributes are the left table's columns minus
        its id column. The space is static, so it is constructed once and
        cached to avoid re-reading the CSV on every property access.
        """
        cached = getattr(self, "_cs", None)
        if cached is not None:
            return cached

        cs = ConfigurationSpace(seed=0)

        # Derive the candidate blocking attributes from the left table.
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|')
        selected_attrs = ltable.columns.values.tolist()
        block_attr_items = selected_attrs[:]
        block_attr_items.remove(ltable_id)

        # --- Blocking ---------------------------------------------------
        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        qgrams = Integer('qgrams', (3, 10), default=6)
        # qgrams is only meaningful when the QGrams blocker is selected.
        use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")
        block_attr = Categorical("block_attr", block_attr_items)

        block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)

        # --- Meta-blocking ----------------------------------------------
        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])

        # --- Matching ---------------------------------------------------
        # todo other hyperparameters
        matching_metric = Categorical("matching_metric",
                                      ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer",
                                          ['tfidf', 'tf', 'boolean'])
        matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))

        # --- Clustering -------------------------------------------------
        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
                                matching_tokenizer, matching_vectorizer, clusteror, qgrams, block_filtering_ratio,
                                matching_sim_thresh])
        cs.add_conditions([use_qgrams])

        self._cs = cs
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Run the ER pipeline for *config* and return the cost to minimize.

        SMAC minimizes cost, so the pipeline's performance indicator is
        inverted (cost = 1 - performance). *seed* is part of the SMAC
        target-function signature but is unused here (the pipeline is
        deterministic per configuration).
        """
        indicators = er_process(config)
        return 1 - indicators['performance']
|
|
|
|
|
|
|
|
|
|
|
|
def ml_er_hpo():
    """Run SMAC hyperparameter optimization for the ER pipeline.

    Persists the configuration space (``configspace.json``) and the
    winning configuration (``incumbent.json``) under ``hpo_output_dir``,
    and returns the incumbent :class:`Configuration`.
    """
    classifier = Classifier()
    cs = classifier.configspace

    # Serialize the configuration space so a later run can reload it.
    # os.path.join keeps the path portable (the original hard-coded a
    # Windows backslash separator).
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1,
        deterministic=True,
        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()

    # Compare the optimized configuration against the space's default and
    # keep whichever achieves the lower validated cost.
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the full HPO loop when executed directly.
    ml_er_hpo()
|