# matching_dependency_pyJedAI/hpo/er_model_hpo.py

import pandas as pd
import json
from time import *
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.read_and_write import json as csj
from smac import HyperparameterOptimizationFacade, Scenario
from settings import *
from ml_er.ml_entity_resolver import er_process
from settings import ltable_path, ltable_id


class Classifier:
    """Target model handed to SMAC: a search space plus a cost function.

    ``configspace`` declares the tunable choices and ``train`` scores one
    configuration by running ``er_process`` on it.
    """

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the hyperparameter search space.

        All hyperparameters are categorical. The choices for ``block_attr``
        are the column names of the table at ``ltable_path`` with the id
        column ``ltable_id`` removed.

        Returns:
            A ``ConfigurationSpace`` (seed 0) holding the eight
            hyperparameters declared below.

        Raises:
            ValueError: If ``ltable_id`` is not a column of the table.
        """
        cs = ConfigurationSpace(seed=0)

        # Only the column names are needed, so read just the header
        # (nrows=0) instead of parsing every data row on each access.
        ltable_header = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='|', nrows=0)
        block_attr_items = ltable_header.columns.tolist()
        block_attr_items.remove(ltable_id)

        # Blocking-related choices.
        jed_blocker = Categorical("jed_blocker",
                                  ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
        block_attr = Categorical("block_attr", block_attr_items)
        # TODO: also tune the filter ratio, e.g.
        # Float("filter_ratio", (0.0, 1.0), default=0.8)
        meta_blocker = Categorical("meta_blocker",
                                   ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
        weighting_scheme = Categorical("weighting_scheme",
                                       ['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',
                                        'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])

        # Matching and clustering choices.
        # TODO: add the remaining hyperparameters.
        matching_metric = Categorical("matching_metric",
                                      ['cosine', 'euclidean'])
        matching_tokenizer = Categorical("matching_tokenizer",
                                         ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
        matching_vectorizer = Categorical("matching_vectorizer",
                                          ['tfidf', 'tf', 'boolean'])
        clusteror = Categorical("clusteror_name",
                                ["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])

        cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
                                matching_tokenizer, matching_vectorizer, clusteror])
        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Evaluate one configuration and return its cost.

        Args:
            config: Configuration passed on unchanged to ``er_process``.
            seed: Accepted as part of the target-function signature; it is
                not used by this implementation.

        Returns:
            ``1 - indicators['performance']``, so a higher performance
            yields a lower cost.
        """
        indicators = er_process(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    """Run the SMAC hyperparameter optimisation and persist its results.

    Writes the search space to ``configspace.json`` and the retained
    configuration to ``incumbent.json``, both inside ``hpo_output_dir``
    (the directory is expected to exist already).

    Returns:
        The retained configuration: the incumbent found by SMAC, or the
        default configuration when the default validates to a lower cost.
    """
    # Function-local import so this fix does not depend on the ordering of
    # the file-level (star) imports.
    import os

    classifier = Classifier()
    cs = classifier.configspace

    # Round-trip the serialised space through json so it is written indented.
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # os.path.join instead of a hard-coded "\" keeps the path valid on
    # non-Windows systems as well.
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        deterministic=True,
        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if the search did not beat it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration:{incumbent.values()}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


# Script entry point: run the optimisation when this file is executed directly.
if __name__ == '__main__':
    ml_er_hpo()
fuck 1 year ago			`import pandas as pd`
			`import json`
			`from time import *`
			`from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float`
			`from ConfigSpace.read_and_write import json as csj`
			`from smac import HyperparameterOptimizationFacade, Scenario`
			`from settings import *`
			`from ml_er.ml_entity_resolver import er_process`
			`from settings import ltable_path, ltable_id`


			`class Classifier:`
			`@property`
			`def configspace(self) -> ConfigurationSpace:`
			`cs = ConfigurationSpace(seed=0)`
			`ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1', sep='\|')`
			`selected_attrs = ltable.columns.values.tolist()`
			`block_attr_items = selected_attrs[:]`
			`block_attr_items.remove(ltable_id)`

			`jed_blocker = Categorical("jed_blocker",`
			`["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])`
			`block_attr = Categorical("block_attr", block_attr_items)`
			`# filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)`
			`meta_blocker = Categorical("meta_blocker",`
			`["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])`
			`weighting_scheme = Categorical("weighting_scheme",`
			`['CN-CBS', 'CBS', 'SN-CBS', 'CNC', 'SNC', 'SND', 'CND', 'CNJ', 'SNJ',`
			`'COSINE', 'DICE', 'ECBS', 'JS', 'EJS', 'X2'])`

			`# todo other hyperparameters`
			`matching_metric = Categorical("matching_metric",`
			`['cosine', 'euclidean'])`
			`matching_tokenizer = Categorical("matching_tokenizer",`
			`['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])`
			`matching_vectorizer = Categorical("matching_vectorizer",`
			`['tfidf', 'tf', 'boolean'])`
			`clusteror = Categorical("clusteror_name",`
			`["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])`

			`cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,`
			`matching_tokenizer, matching_vectorizer, clusteror])`
			`return cs`

			`def train(self, config: Configuration, seed: int = 0) -> float:`
			`indicators = er_process(config)`
			`return 1-indicators['performance']`


			`def ml_er_hpo():`
			`classifier = Classifier()`
			`cs = classifier.configspace`
			`str_configspace = csj.write(cs)`
			`dict_configspace = json.loads(str_configspace)`
			`with open(hpo_output_dir + r"\configspace.json", "w") as f:`
			`json.dump(dict_configspace, f, indent=4)`

			`scenario = Scenario(`
			`cs,`
			`deterministic=True,`
			`n_trials=50, # We want to run max 50 trials (combination of config and seed)`
			`n_workers=1`
			`)`

			`initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)`

			`smac = HyperparameterOptimizationFacade(`
			`scenario,`
			`classifier.train,`
			`initial_design=initial_design,`
			`overwrite=True, # If the run exists, we overwrite it; alternatively, we can continue from last state`
			`)`

			`incumbent = smac.optimize()`
			`incumbent_cost = smac.validate(incumbent)`
			`default = cs.get_default_configuration()`
			`default_cost = smac.validate(default)`
			`print(f"Default Cost: {default_cost}")`
			`print(f"Incumbent Cost: {incumbent_cost}")`

			`if incumbent_cost > default_cost:`
			`incumbent = default`
			`print(f"Updated Incumbent Cost: {default_cost}")`

			`print(f"Optimized Configuration:{incumbent.values()}")`

			`with open(hpo_output_dir + r"\incumbent.json", "w") as f:`
			`json.dump(dict(incumbent), f, indent=4)`
			`return incumbent`


			`if __name__ == '__main__':`
			`ml_er_hpo()`