|
|
|
@ -1,7 +1,7 @@
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import json
|
|
|
|
|
from time import *
|
|
|
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
|
|
|
|
|
from ConfigSpace import EqualsCondition
|
|
|
|
|
from ConfigSpace.read_and_write import json as csj
|
|
|
|
|
from smac import HyperparameterOptimizationFacade, Scenario
|
|
|
|
|
from settings import *
|
|
|
|
@ -20,8 +20,12 @@ class Classifier:
|
|
|
|
|
|
|
|
|
|
jed_blocker = Categorical("jed_blocker",
|
|
|
|
|
["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
|
|
|
|
|
qgrams = Integer('qgrams', (3, 10), default=6)
|
|
|
|
|
use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")
|
|
|
|
|
block_attr = Categorical("block_attr", block_attr_items)
|
|
|
|
|
# filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)
|
|
|
|
|
|
|
|
|
|
block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)
|
|
|
|
|
|
|
|
|
|
meta_blocker = Categorical("meta_blocker",
|
|
|
|
|
["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
|
|
|
|
|
weighting_scheme = Categorical("weighting_scheme",
|
|
|
|
@ -35,11 +39,14 @@ class Classifier:
|
|
|
|
|
['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
|
|
|
|
|
matching_vectorizer = Categorical("matching_vectorizer",
|
|
|
|
|
['tfidf', 'tf', 'boolean'])
|
|
|
|
|
matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))
|
|
|
|
|
clusteror = Categorical("clusteror_name",
|
|
|
|
|
["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
|
|
|
|
|
["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
|
|
|
|
|
|
|
|
|
|
cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
|
|
|
|
|
matching_tokenizer, matching_vectorizer, clusteror])
|
|
|
|
|
matching_tokenizer, matching_vectorizer, clusteror, qgrams, block_filtering_ratio,
|
|
|
|
|
matching_sim_thresh])
|
|
|
|
|
cs.add_conditions([use_qgrams])
|
|
|
|
|
return cs
|
|
|
|
|
|
|
|
|
|
def train(self, config: Configuration, seed: int = 0) -> float:
|
|
|
|
@ -57,6 +64,7 @@ def ml_er_hpo():
|
|
|
|
|
|
|
|
|
|
scenario = Scenario(
|
|
|
|
|
cs,
|
|
|
|
|
crash_cost=1,
|
|
|
|
|
deterministic=True,
|
|
|
|
|
n_trials=50, # We want to run max 50 trials (combination of config and seed)
|
|
|
|
|
n_workers=1
|
|
|
|
|