parent 747ee968b2
commit d79453e5c3
@@ -0,0 +1 @@
/md_discovery/output/mds.txt
@@ -0,0 +1,84 @@
import json
import os

from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
from ConfigSpace.read_and_write import json as csj
from smac import Scenario, HyperparameterOptimizationFacade

from ml_er.deepmatcher_er import matching
from setting import hpo_output_dir


class Optimization:
    @property
    def configspace(self) -> ConfigurationSpace:
        # Search space over DeepMatcher's architectural choices.
        cs = ConfigurationSpace(seed=0)

        attr_summarizer = Categorical('attr_summarizer', ['sif', 'rnn', 'attention', 'hybrid'], default='hybrid')
        attr_comparator = Categorical('attr_comparator', ['concat', 'diff', 'abs-diff', 'concat-diff', 'concat-abs-diff', 'mul'])
        word_contextualizer = Categorical('word_contextualizer', ['gru', 'lstm', 'rnn', 'self-attention'])
        word_comparator = Categorical('word_comparator', ['decomposable-attention', 'general-attention', 'dot-attention'])
        word_aggregator = Categorical('word_aggregator', ['avg-pool', 'divsqrt-pool', 'inv-freq-avg-pool',
                                                          'sif-pool', 'max-pool', 'last-pool', 'last-simple-pool',
                                                          'birnn-last-pool', 'birnn-last-simple-pool', 'attention-with-rnn'])
        classifier_layers = Integer('classifier_layers', (1, 4))
        classifier_nonlinear = Categorical('classifier_nonlinear', ['leaky_relu', 'relu', 'elu', 'selu', 'glu', 'tanh', 'sigmoid'])
        classifier_bypass = Categorical('classifier_bypass', ['residual', 'highway'])
        embeddings = Categorical('embeddings', ['fasttext.en.bin', 'fasttext.wiki.vec', 'fasttext.crawl.vec',
                                                'glove.6B.300d', 'glove.42B.300d', 'glove.840B.300d'])

        cs.add_hyperparameters([attr_comparator, attr_summarizer, word_comparator, word_aggregator, word_contextualizer,
                                classifier_bypass, classifier_nonlinear, classifier_layers, embeddings])

        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes cost, so the objective is 1 - performance.
        indicators = matching(config)
        return 1 - indicators['performance']


def ml_er_hpo():
    optimization = Optimization()
    cs = optimization.configspace
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)
    # Save the hyperparameter search space locally.
    # Join paths explicitly: hpo_output_dir carries no trailing separator.
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=20,
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from the last state.
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if the search did not beat it.
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {dict(incumbent)}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    ml_er_hpo()
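The commit also imports ConfigSpace's condition classes (InCondition, EqualsCondition, AndConjunction) without attaching any condition, so the space above is fully unconditional. A minimal sketch of how a condition could prune it is below; the summarizer-to-parameter mapping is an assumption for illustration only, not taken from the project:

from ConfigSpace import Categorical, ConfigurationSpace
from ConfigSpace.conditions import InCondition

cs = ConfigurationSpace(seed=0)
attr_summarizer = Categorical('attr_summarizer', ['sif', 'rnn', 'attention', 'hybrid'], default='hybrid')
word_comparator = Categorical('word_comparator', ['decomposable-attention', 'general-attention', 'dot-attention'])
cs.add_hyperparameters([attr_summarizer, word_comparator])

# Assumed mapping: only sample word_comparator when the summarizer includes an attention step.
cs.add_condition(InCondition(word_comparator, attr_summarizer, ['attention', 'hybrid']))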
@@ -0,0 +1,12 @@
from sentence_transformers import SentenceTransformer

directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Textual\Abt-Buy'

er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\hpo\output'

model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
interpre_weight = 0  # interpretability weight
support_threshold = 1
confidence_threshold = 0.75
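Because model is created at module level, every import of setting shares one loaded SentenceTransformer. A hedged usage sketch of that shared model (the product strings are made-up sample data, and the public 'all-MiniLM-L6-v2' checkpoint stands in for the local copy):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumption: public checkpoint in place of the local path

# Embed two attribute values and compare them by cosine similarity.
emb = model.encode(['Sony Cyber-shot DSC-W120', 'Sony DSC W120 Digital Camera'], convert_to_tensor=True)
similarity = util.cos_sim(emb[0], emb[1]).item()  # cosine similarity in [-1, 1]
print(f'cosine similarity: {similarity:.3f}')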
@@ -0,0 +1,29 @@
# The train/valid/test sets contain only left/right id mappings; this module
# builds the complete sets by joining those ids back to the left and right tables.
import pandas as pd

from setting import directory_path


def build_whole_X_set(X_set, _ltable, _rtable):
    merged_set = pd.merge(X_set, _ltable, on='ltable_id', how='left')
    merged_set = pd.merge(merged_set, _rtable, on='rtable_id', how='left')
    merged_set.insert(0, '_id', range(len(merged_set)))
    return merged_set


if __name__ == '__main__':
    # Read the two tables and prefix their columns with ltable_/rtable_.
    ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1').rename(columns=lambda x: f'ltable_{x}')
    rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1').rename(columns=lambda x: f'rtable_{x}')

    train = pd.read_csv(directory_path + r'\train.csv', encoding='ISO-8859-1')
    valid = pd.read_csv(directory_path + r'\valid.csv', encoding='ISO-8859-1')
    test = pd.read_csv(directory_path + r'\test.csv', encoding='ISO-8859-1')

    train = build_whole_X_set(train, ltable, rtable)
    valid = build_whole_X_set(valid, ltable, rtable)
    test = build_whole_X_set(test, ltable, rtable)

    train.to_csv(directory_path + r'\train_whole.csv', sep=',', index=False, header=True)
    valid.to_csv(directory_path + r'\valid_whole.csv', sep=',', index=False, header=True)
    test.to_csv(directory_path + r'\test_whole.csv', sep=',', index=False, header=True)
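A toy sanity check of build_whole_X_set with hypothetical one-row tables (not part of the commit) shows the column layout the merge produces:

import pandas as pd

ltable = pd.DataFrame({'ltable_id': [0], 'ltable_name': ['Sony DSC-W120']})
rtable = pd.DataFrame({'rtable_id': [0], 'rtable_name': ['Sony Cyber-shot W120']})
pairs = pd.DataFrame({'ltable_id': [0], 'rtable_id': [0], 'label': [1]})

whole = build_whole_X_set(pairs, ltable, rtable)
print(whole.columns.tolist())
# ['_id', 'ltable_id', 'rtable_id', 'label', 'ltable_name', 'rtable_name']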