import os
import numpy as np
import torch
import json
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
from ConfigSpace.conditions import InCondition
from ConfigSpace.read_and_write import json as csj
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm
import pandas as pd
from smac import HyperparameterOptimizationFacade, Scenario
from md_discovery.md_discover import md_discover
from settings import *
from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable, build_col_pairs_sim_tensor_dict, \
process_prediction_for_md_discovery, er_process
from ml_er.ml_entity_resolver import er_process
# Holds the hyperparameter definitions for the ML entity-resolution run and the
# objective that scores one configuration via er_process.
# NOTE(review): as visible here the body has no indentation and no method
# headers; `cs` and `config` are used below but never bound in the lines shown,
# and there are two bare `return` statements. Presumably these lines belong to
# two methods (one building the search space, one evaluating a config) whose
# `def` lines and branch statements are not shown — confirm against the full file.
class Classifier:
# Categorical choice of blocking attribute; the candidate values come from
# block_attr_items (not defined in view — presumably from `settings`; verify).
block_attr = Categorical("block_attr", block_attr_items)
# Integer in [1, 3], default 1.
overlap_size = Integer("overlap_size", (1, 3), default=1)
# Matcher choice among six string codes, default "rf".
ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
# Blocker choice, default "over_lap".
ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
# First set of threshold ranges.
similarity_thresh = Float("similarity_thresh", (0.2, 0.21))
support_thresh = Integer("support_thresh", (1, 1000))
confidence_thresh = Float("confidence_thresh", (0.25, 0.5))
# Second set of threshold ranges bound to the same three names.
# NOTE(review): read top to bottom, these assignments overwrite the three
# above; presumably the two sets sit in alternative branches — confirm.
similarity_thresh = Float("similarity_thresh", (0, 0.2), default=0.2)
support_thresh = Integer("support_thresh", (1, 5), default=1)
confidence_thresh = Float("confidence_thresh", (0.3, 0.7), default=0.4)
# Condition tying overlap_size to ml_blocker taking the value "over_lap".
# NOTE(review): no line in view registers this condition on `cs`
# (e.g. via an add_conditions call) — verify it is attached in the full file.
use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
# Registers all seven hyperparameters, including overlap_size.
cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker,
similarity_thresh, support_thresh, confidence_thresh])
# Registers the same hyperparameters except overlap_size.
# NOTE(review): as shown, both add_hyperparameters calls would run in
# sequence on the same `cs`; presumably they are alternative branches — confirm.
cs.add_hyperparameters([block_attr, ml_matcher, ml_blocker, similarity_thresh,
support_thresh, confidence_thresh])
return cs
# "train" is the whole pipeline function; the only difference is that it
# returns an evaluation of the predictions rather than the predictions themselves.
# Runs er_process on the given configuration and reads its 'performance' entry.
indicators = er_process(config)
# Returns 1 - performance, so a higher performance yields a lower returned value.
return 1-indicators['performance']
def ml_er_hpo():
classifier = Classifier()
with open(hpo_output_dir + "configspace.json", "w") as f:
json.dump(dict_configspace, f, indent=4)
# Next, we create an object, holding general information about the run
scenario = Scenario(
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
# Now we use SMAC to find the best hyperparameters
smac = HyperparameterOptimizationFacade(