parent 747ee968b2
commit d79453e5c3
@@ -0,0 +1 @@
/md_discovery/output/mds.txt
@@ -0,0 +1,84 @@
import json
import os

from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
from ConfigSpace.read_and_write import json as csj
from smac import Scenario, HyperparameterOptimizationFacade

from ml_er.deepmatcher_er import matching
from setting import hpo_output_dir


class Optimization:
    @property
    def configspace(self) -> ConfigurationSpace:
        # Search space over DeepMatcher's architectural hyperparameters.
        cs = ConfigurationSpace(seed=0)

        attr_summarizer = Categorical('attr_summarizer', ['sif', 'rnn', 'attention', 'hybrid'], default='hybrid')
        attr_comparator = Categorical('attr_comparator', ['concat', 'diff', 'abs-diff', 'concat-diff', 'concat-abs-diff', 'mul'])
        word_contextualizer = Categorical('word_contextualizer', ['gru', 'lstm', 'rnn', 'self-attention'])
        word_comparator = Categorical('word_comparator', ['decomposable-attention', 'general-attention', 'dot-attention'])
        word_aggregator = Categorical('word_aggregator', ['avg-pool', 'divsqrt-pool', 'inv-freq-avg-pool',
                                                          'sif-pool', 'max-pool', 'last-pool', 'last-simple-pool',
                                                          'birnn-last-pool', 'birnn-last-simple-pool', 'attention-with-rnn'])
        classifier_layers = Integer('classifier_layers', (1, 4))
        classifier_nonlinear = Categorical('classifier_nonlinear', ['leaky_relu', 'relu', 'elu', 'selu', 'glu', 'tanh', 'sigmoid'])
        classifier_bypass = Categorical('classifier_bypass', ['residual', 'highway'])
        embeddings = Categorical('embeddings', ['fasttext.en.bin', 'fasttext.wiki.vec', 'fasttext.crawl.vec',
                                                'glove.6B.300d', 'glove.42B.300d', 'glove.840B.300d'])

        cs.add_hyperparameters([attr_comparator, attr_summarizer, word_comparator, word_aggregator, word_contextualizer,
                                classifier_bypass, classifier_nonlinear, classifier_layers, embeddings])

        return cs

    def train(self, config: Configuration, seed: int = 0) -> float:
        # SMAC minimizes cost, so return 1 - performance (e.g. 1 - F1).
        indicators = matching(config)
        return 1 - indicators['performance']
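
The original file also imported ConfigSpace's condition classes (InCondition, EqualsCondition), which suggests conditional dependencies between these choices were planned. A minimal sketch of how such a constraint could look, assuming (per DeepMatcher's presets) that word_comparator only affects the attention-based summarizers; this is an illustration, not part of the committed search space:

# Sketch: make word_comparator conditional on attr_summarizer.
# Assumption: word_comparator is only used by 'attention' and 'hybrid' summarizers.
from ConfigSpace import Categorical, ConfigurationSpace
from ConfigSpace.conditions import InCondition

cs = ConfigurationSpace(seed=0)
attr_summarizer = Categorical('attr_summarizer', ['sif', 'rnn', 'attention', 'hybrid'], default='hybrid')
word_comparator = Categorical('word_comparator', ['decomposable-attention', 'general-attention', 'dot-attention'])
cs.add_hyperparameters([attr_summarizer, word_comparator])
cs.add_condition(InCondition(child=word_comparator, parent=attr_summarizer, values=['attention', 'hybrid']))

With the condition in place, SMAC only samples word_comparator when it is actually active, which shrinks the effective search space.
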


def ml_er_hpo():
    optimization = Optimization()
    cs = optimization.configspace
    str_configspace = csj.write(cs)
    dict_configspace = json.loads(str_configspace)

    # Save the hyperparameter space to disk
    with open(os.path.join(hpo_output_dir, "configspace.json"), "w") as f:
        json.dump(dict_configspace, f, indent=4)

    scenario = Scenario(
        cs,
        crash_cost=1.0,
        deterministic=True,
        n_trials=20,
        n_workers=1
    )

    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    smac = HyperparameterOptimizationFacade(
        scenario,
        optimization.train,
        initial_design=initial_design,
        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from the last state
    )

    incumbent = smac.optimize()
    incumbent_cost = smac.validate(incumbent)
    default = cs.get_default_configuration()
    default_cost = smac.validate(default)
    print(f"Default Cost: {default_cost}")
    print(f"Incumbent Cost: {incumbent_cost}")

    # Fall back to the default configuration if the search did not beat it
    if incumbent_cost > default_cost:
        incumbent = default
        print(f"Updated Incumbent Cost: {default_cost}")

    print(f"Optimized Configuration: {dict(incumbent)}")

    with open(os.path.join(hpo_output_dir, "incumbent.json"), "w") as f:
        json.dump(dict(incumbent), f, indent=4)
    return incumbent


if __name__ == '__main__':
    ml_er_hpo()
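
The configspace.json written above can be loaded back in a downstream script with the same read_and_write helper; a minimal sketch (note that csj.read takes the JSON string, not a file object):

import os
from ConfigSpace.read_and_write import json as csj
from setting import hpo_output_dir

# Reload the search space saved by ml_er_hpo()
with open(os.path.join(hpo_output_dir, "configspace.json"), "r") as f:
    cs = csj.read(f.read())
print(cs.get_default_configuration())
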
@@ -0,0 +1,12 @@
from sentence_transformers import SentenceTransformer

directory_path = r'E:\Data\Research\Datasets\DeepMatcher dataset\Textual\Abt-Buy'

er_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\ml_er\output'
md_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\md_discovery\output'
hpo_output_dir = r'E:\Data\Research\Projects\md_bayesian_er_deepmatcher\hpo\output'

model = SentenceTransformer(r'E:\Data\Research\Models\all-MiniLM-L6-v2')
interpre_weight = 0  # interpretability weight
support_threshold = 1
confidence_threshold = 0.75
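
setting.py loads an all-MiniLM-L6-v2 sentence encoder alongside the MD support/confidence thresholds, presumably for comparing attribute values during MD discovery. A hypothetical sketch of that kind of similarity computation (the example strings are made up; model.encode and util.cos_sim are real sentence-transformers APIs):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # or the local path from setting.py
# Cosine similarity between two attribute values (illustrative strings)
emb = model.encode(['sony dsc-w55 digital camera', 'sony cybershot w55'], convert_to_tensor=True)
print(util.cos_sim(emb[0], emb[1]).item())  # value in [-1, 1]; compare against a threshold
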
@@ -0,0 +1,29 @@
# The train/valid/test sets contain only left/right id mappings; this script
# joins the left and right tables on those ids to build the complete sets.
import pandas as pd

from setting import directory_path


def build_whole_X_set(X_set, _ltable, _rtable):
    # Left joins keep every id pair from X_set, even if a table row is missing
    merged_set = pd.merge(X_set, _ltable, on='ltable_id', how='left')
    merged_set = pd.merge(merged_set, _rtable, on='rtable_id', how='left')
    merged_set.insert(0, '_id', range(len(merged_set)))
    return merged_set


if __name__ == '__main__':
    # Read the two tables and prefix their columns with ltable_/rtable_
    ltable = pd.read_csv(directory_path + r'\tableA.csv', encoding='ISO-8859-1').rename(columns=lambda x: f'ltable_{x}')
    rtable = pd.read_csv(directory_path + r'\tableB.csv', encoding='ISO-8859-1').rename(columns=lambda x: f'rtable_{x}')

    train = pd.read_csv(directory_path + r'\train.csv', encoding='ISO-8859-1')
    valid = pd.read_csv(directory_path + r'\valid.csv', encoding='ISO-8859-1')
    test = pd.read_csv(directory_path + r'\test.csv', encoding='ISO-8859-1')

    train = build_whole_X_set(train, ltable, rtable)
    valid = build_whole_X_set(valid, ltable, rtable)
    test = build_whole_X_set(test, ltable, rtable)

    train.to_csv(directory_path + r'\train_whole.csv', sep=',', index=False, header=True)
    valid.to_csv(directory_path + r'\valid_whole.csv', sep=',', index=False, header=True)
    test.to_csv(directory_path + r'\test_whole.csv', sep=',', index=False, header=True)
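
Because both merges are left joins, an id missing from tableA/tableB yields NaN columns silently rather than raising an error. A small sanity check one could append after the builds (hypothetical, not in the commit; column prefixes follow the rename above):

for name, df in [('train', train), ('valid', valid), ('test', test)]:
    # Rows whose left/right table columns are all NaN indicate an unmatched id
    l_cols = [c for c in df.columns if c.startswith('ltable_') and c != 'ltable_id']
    r_cols = [c for c in df.columns if c.startswith('rtable_') and c != 'rtable_id']
    print(name, df[l_cols].isna().all(axis=1).sum(), df[r_cols].isna().all(axis=1).sum())

A record whose attributes are genuinely all empty would also be counted, so treat a nonzero count as a prompt to inspect the join, not as proof it is broken.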