Compare commits
2 Commits
b21b0aa496 ... c1e07eabb6
Author | SHA1 | Date |
---|---|---|
HuangJintao | c1e07eabb6 | 7 months ago |
HuangJintao | 9b06ce3840 | 7 months ago |
@@ -1,21 +0,0 @@
-import csv
-import pandas as pd
-import json
-import sentence_transformers.util
-import torch
-from sentence_transformers import SentenceTransformer
-from torch import nn
-
-if __name__ == '__main__':
-    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
-    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
-    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
-    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
-    train = train[train['label'] == 1]
-    valid = valid[valid['label'] == 1]
-    test = test[test['label'] == 1]
-    matches = pd.concat([train, valid, test])
-    matches.drop(columns=['label'], inplace=True)
-    matches = matches.sort_values(by='ltable_id')
-    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)
@@ -1,73 +0,0 @@
-import json
-from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
-from ConfigSpace.conditions import InCondition
-from ConfigSpace.read_and_write import json as csj
-import py_entitymatching.catalog.catalog_manager as cm
-import pandas as pd
-from smac import HyperparameterOptimizationFacade, Scenario
-from settings import *
-from ml_er.ml_entity_resolver import er_process
-
-
-class Classifier:
-    @property
-    def configspace(self) -> ConfigurationSpace:
-        cs = ConfigurationSpace(seed=0)
-        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
-        # todo: hyperparameters for each classifier
-        tree_criterion = Categorical("dt_criterion", ["gini", "entropy", "log_loss"], default="gini")
-
-
-        cs.add_hyperparameters([ml_matcher])
-        return cs
-
-    def train(self, config: Configuration, seed: int = 0) -> float:
-        cm.del_catalog()
-        indicators = er_process(config)
-        return 1-indicators['performance']
-
-
-def ml_er_hpo():
-    classifier = Classifier()
-    cs = classifier.configspace
-    str_configspace = csj.write(cs)
-    dict_configspace = json.loads(str_configspace)
-    with open(hpo_output_dir + "configspace.json", "w") as f:
-        json.dump(dict_configspace, f, indent=4)
-
-    scenario = Scenario(
-        cs,
-        deterministic=True,
-        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
-        n_workers=1
-    )
-
-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
-
-    smac = HyperparameterOptimizationFacade(
-        scenario,
-        classifier.train,
-        initial_design=initial_design,
-        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
-    )
-
-    incumbent = smac.optimize()
-    incumbent_cost = smac.validate(incumbent)
-    default = cs.get_default_configuration()
-    default_cost = smac.validate(default)
-    print(f"Default Cost: {default_cost}")
-    print(f"Incumbent Cost: {incumbent_cost}")
-
-    if incumbent_cost > default_cost:
-        incumbent = default
-        print(f"Updated Incumbent Cost: {default_cost}")
-
-    print(f"Optimized Configuration:{incumbent.values()}")
-
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
-        json.dump(dict(incumbent), f, indent=4)
-    return incumbent
-
-
-if __name__ == '__main__':
-    ml_er_hpo()
@@ -0,0 +1,115 @@
+import json
+import pickle
+
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
+from ConfigSpace.read_and_write import json as csj
+import py_entitymatching.catalog.catalog_manager as cm
+import pandas as pd
+from smac import HyperparameterOptimizationFacade, Scenario
+
+from ml_er.magellan_new import matching
+from settings import *
+
+
+class Classifier:
+    @property
+    def configspace(self) -> ConfigurationSpace:
+        cs = ConfigurationSpace(seed=0)
+
+        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])
+        # note: hyperparameters prefixed with "tree" are shared by DT and RF
+        tree_criterion = Categorical("tree_criterion", ["gini", "entropy", "log_loss"], default="gini")
+        rf_n_estimators = Integer('number_of_tree', (10, 150))
+        tree_max_depth = Integer('tree_max_depth', (15, 30), default=None)
+        rf_max_features = Categorical('rf_max_features', ["sqrt", "log2", "auto"], default='sqrt')
+
+        svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
+        svm_C = Integer('svm_C', (1, 100), default=1)
+        svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
+        svm_degree = Integer('svm_degree', (1, 3), default=3)
+        svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
+
+        dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
+        dt_max_features = Categorical('dt_max_features', ["auto", "sqrt", "log2"], default=None)
+
+        cs.add_hyperparameters([ml_matcher, tree_criterion, rf_n_estimators, tree_max_depth, rf_max_features,
+                                svm_kernel, svm_C, svm_gamma, svm_degree, svm_constant, dt_splitter, dt_max_features])
+
+        active_tree_criterion = InCondition(child=tree_criterion, parent=ml_matcher, values=['dt', 'rf'])
+        active_tree_max_depth = InCondition(child=tree_max_depth, parent=ml_matcher, values=['dt', 'rf'])
+        active_rf_n_estimators = EqualsCondition(child=rf_n_estimators, parent=ml_matcher, value="rf")
+        active_rf_max_features = EqualsCondition(child=rf_max_features, parent=ml_matcher, value="rf")
+        active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
+        active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
+        active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
+        active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
+        active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
+        active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
+        active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
+
+        active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
+        active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
+        active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
+
+        cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
+                           active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
+                           AndConjunction(active_svm_gamma1, active_svm_gamma2),
+                           AndConjunction(active_svm_degree1, active_svm_degree2),
+                           AndConjunction(active_svm_constant1, active_svm_constant2)])
+        return cs
+
+    def train(self, config: Configuration, seed: int = 0) -> float:
+        cm.del_catalog()
+        with open(er_output_dir + "blocking_result.pickle", "rb") as file:
+            blocking_result = pickle.load(file)
+        indicators = matching(config, blocking_result)
+        return 1 - indicators['performance']
+
+
+def ml_er_hpo():
+    classifier = Classifier()
+    cs = classifier.configspace
+    str_configspace = csj.write(cs)
+    dict_configspace = json.loads(str_configspace)
+    # save the hyperparameter space locally
+    with open(hpo_output_dir + "configspace.json", "w") as f:
+        json.dump(dict_configspace, f, indent=4)
+
+    scenario = Scenario(
+        cs,
+        crash_cost=1.0,
+        deterministic=True,
+        n_trials=20,
+        n_workers=1
+    )
+
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+
+    smac = HyperparameterOptimizationFacade(
+        scenario,
+        classifier.train,
+        initial_design=initial_design,
+        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
+    )
+
+    incumbent = smac.optimize()
+    incumbent_cost = smac.validate(incumbent)
+    default = cs.get_default_configuration()
+    default_cost = smac.validate(default)
+    print(f"Default Cost: {default_cost}")
+    print(f"Incumbent Cost: {incumbent_cost}")
+
+    if incumbent_cost > default_cost:
+        incumbent = default
+        print(f"Updated Incumbent Cost: {default_cost}")
+
+    print(f"Optimized Configuration:{incumbent.values()}")
+
+    with open(hpo_output_dir + "incumbent.json", "w") as f:
+        json.dump(dict(incumbent), f, indent=4)
+    return incumbent
+
+
+if __name__ == '__main__':
+    ml_er_hpo()
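For readers unfamiliar with ConfigSpace conditions, the following is a minimal sketch (illustrative only, not part of this commit) of how the EqualsCondition/InCondition/AndConjunction wiring above behaves: a hyperparameter whose condition is not satisfied is inactive and simply absent from a sampled configuration, so Classifier.train only ever receives the settings relevant to the sampled ml_matcher.

# Illustrative sketch only (not from this commit): how conditional hyperparameters behave.
# Assumes ConfigSpace is installed; this tiny space mirrors the pattern used in the file above.
from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import EqualsCondition

cs = ConfigurationSpace(seed=0)
ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf"])
svm_C = Integer("svm_C", (1, 100), default=1)
cs.add_hyperparameters([ml_matcher, svm_C])
# svm_C is only active (and only sampled) when ml_matcher == "svm"
cs.add_conditions([EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")])

for config in cs.sample_configuration(5):
    print(dict(config))  # inactive hyperparameters are absent from the dict

In the full space above, the AndConjunction entries additionally require a compatible svm_kernel before svm_gamma, svm_degree or svm_constant become active.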
@@ -0,0 +1,4 @@
+from ml_er.magellan_new import blocking_mining
+
+if __name__ == '__main__':
+    blocking_mining()
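This runner presumably persists the blocking output under er_output_dir so that each HPO trial can reuse it instead of re-blocking; the reload side appears in Classifier.train above. A minimal sketch of that reload step, assuming only the pickle filename used there:

# Sketch (assumption): reload the cached blocking result the way Classifier.train does above.
import pickle
from settings import er_output_dir

with open(er_output_dir + "blocking_result.pickle", "rb") as file:
    blocking_result = pickle.load(file)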
@@ -1,27 +0,0 @@
-import numpy as np
-import pandas as pd
-
-if __name__ == '__main__':
-    lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
-                     encoding='ISO-8859-1', sep=',')
-    mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-                          encoding='ISO-8859-1', sep=',')
-    # drop columns
-    # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
-    #                  'genre_list', 'origin_country'], inplace=True)
-    # rt.drop(columns=['job'], inplace=True)
-    # rename columns
-    # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
-    #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
-    # reorder columns
-    # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
-    #          'runtimeMinutes']]
-    # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
-    # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
-    # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
-    #                sep=',', index=False, header=True, quoting=1)
-    print(1)
-
-
@@ -1,24 +1,24 @@
 from sentence_transformers import SentenceTransformer
 
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\matches.csv'
-mapping_lid = 'idDBLP'  # name of the left-table id column in the mapping table
-mapping_rid = 'idScholar'  # name of the right-table id column in the mapping table
-ltable_block_attr = 'title'
-rtable_block_attr = 'title'
+ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
+mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
+mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
+ltable_block_attr = 'name'
+rtable_block_attr = 'name'
 ltable_id = 'id'  # left-table id field name
 rtable_id = 'id'  # right-table id field name
 target_attr = 'id'  # target field for MD mining
 # lr_attrs_map = {}  # if corresponding fields have different names in the two tables, list them here so they can be aligned
 
 model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
-interpre_weight = 1  # interpretability weight
+interpre_weight = 0  # interpretability weight
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.75
 
-er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
+er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
+md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
 
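As a rough, illustrative sketch of how these settings are typically consumed downstream (the read and encode calls below are assumptions, not code from this commit): the two tables are loaded from ltable_path and rtable_path, and the configured SentenceTransformer embeds the blocking attributes.

# Illustrative sketch only: load the tables and embed the block attributes with the configured model.
import pandas as pd
from settings import ltable_path, rtable_path, ltable_block_attr, rtable_block_attr, model

ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')

l_emb = model.encode(ltable[ltable_block_attr].astype(str).tolist(), convert_to_tensor=True)
r_emb = model.encode(rtable[rtable_block_attr].astype(str).tolist(), convert_to_tensor=True)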
@@ -1,13 +0,0 @@
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
-
-if __name__ == '__main__':
-    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
-    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
-
-    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
-    encoding = tokenizer(table, return_tensors="pt")
-    embedding = tokenizer.encode(table, return_tensors="pt")
-    print(1)
@@ -1,159 +0,0 @@
-import json
-import multiprocessing
-import os
-import time
-
-import ConfigSpace
-import numpy as np
-import pandas as pd
-import torch
-from tqdm import tqdm
-from ConfigSpace.read_and_write import json as csj
-from md_discovery import discovery_executor
-from settings import er_output_dir, hpo_output_dir
-
-
-def fuck(i):
-    i = i * i + 1
-
-
-def test1():
-    li = [[[6, 6, 2],
-           [2, 4, 6],
-           [2, 4, 7],
-           [3, 6, 4]],
-          [[6, 2, 7],
-           [3, 2, 4],
-           [5, 3, 5],
-           [6, 2, 4]],
-          [[7, 2, 2],
-           [6, 3, 2],
-           [6, 4, 3],
-           [6, 5, 6]]]
-    tensor = torch.Tensor(li)
-    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    print(norm_tensor, '\n')
-    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    print(sim_ten / 2 + 0.5, '\n')
-    print(sim_ten.size())
-
-
-def test2():
-    multiprocessing.set_start_method("spawn")
-    manager = multiprocessing.Manager()
-    lock = manager.Lock()
-    pool = multiprocessing.Pool(16)
-    with manager:
-        for _ in tqdm(range(0, 1000)):
-            result = pool.apply_async(fuck, args=(_,))
-            print(result)
-
-
-def test3():
-    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
-    ll = list(dic.values())
-    ten = torch.Tensor(ll)
-    t = ten.unsqueeze(1)
-    t = t.unsqueeze(2)
-    y = t.repeat(1, 742, 742)
-    print(ten)
-    print(y)
-    print(torch.isfinite(ten))
-    print(torch.count_nonzero(y).item())
-
-
-def test4():
-    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
-    print(torch.count_nonzero(one_bool_tensor).item())
-
-
-def test5():
-    ten1 = torch.tensor([[1, 2, 3],
-                         [7, 8, 9]])
-    ten2 = torch.tensor([[4, 5, 6],
-                         [11, 12, 15]])
-    result = ten1 * ten2
-    r = torch.sum(result, 1)
-    print('\n')
-    print(result)
-    print(r)
-
-
-def test6():
-    table_tensor = torch.tensor([[[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]],
-                                 [[1., 2., 3.],
-                                  [4., 5., 6.],
-                                  [7., 8., 9.]]])
-    t = torch.tensor([[1., 2., 3.],
-                      [4., 5., 6.]])
-    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
-    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
-    print('\n')
-    print(norm1)
-    print(norm2)
-    print(t.shape)
-
-
-def test7():
-    iterations = 1
-    filename_list = os.listdir(er_output_dir)
-    if len(filename_list) > 0:
-        for _ in filename_list:
-            if _.startswith('eval_result'):
-                iterations = int(_[12:13]) + 1
-    print(iterations)
-
-
-def test8():
-    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
-        dict_configspace = json.load(load_f)
-    str_configspace = json.dumps(dict_configspace)
-    configspace = csj.read(str_configspace)
-
-
-def test9():
-    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
-    df.to_csv(r'./datasets/s.csv')
-    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
-    print(1)
-
-
-def test10():
-    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
-                         encoding='ISO-8859-1')
-    print(1)
-    rtable.columns = ["id", "title", "authors", "venue", "year"]
-    rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
-                  sep=',', index=False, header=True, quoting=1)
-
-
-def test11():
-    values = {
-        'block_attr': 'class',
-        'confidence_thresh': 0.2717823249253852,
-        'ml_blocker': 'attr_equiv',
-        'ml_matcher': 'ln',
-        'similarity_thresh': 0.20681820299103484,
-        'support_thresh': 129,
-    }
-    with open(hpo_output_dir + "incumbent.json", "w") as f:
-        json.dump(values, f, indent=4)
-
-
-def test12():
-    with open(hpo_output_dir + "incumbent.json", 'r') as f:
-        dic = json.load(f)
-    for _ in dic.keys():
-        print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')
-
-
-def test13():
-    outcome_dir = r'E:\Data\Research\Outcome'
-    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
-    datasets_list = os.listdir(outcome_dir)
-    f = []
-    for _ in datasets_list:
-        f.append(outcome_dir + rf'\{_}' + configs_dir)
-    print(f)