|
|
@ -1,4 +1,6 @@
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
|
|
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
|
|
|
|
from ConfigSpace.conditions import InCondition
|
|
|
|
from ConfigSpace.conditions import InCondition
|
|
|
|
import py_entitymatching as em
|
|
|
|
import py_entitymatching as em
|
|
|
@ -37,7 +39,7 @@ selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字
|
|
|
|
########################################################################################################################
|
|
|
|
########################################################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
|
|
|
|
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
|
|
|
|
test_proportion: float) -> dict:
|
|
|
|
test_proportion: float) -> dict:
|
|
|
|
new_df = df.reset_index(drop=False, inplace=False)
|
|
|
|
new_df = df.reset_index(drop=False, inplace=False)
|
|
|
|
gold = new_df[labeled_attr]
|
|
|
|
gold = new_df[labeled_attr]
|
|
|
@ -61,7 +63,7 @@ def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str
|
|
|
|
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
|
|
|
|
precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
|
|
|
|
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
|
|
|
|
recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
|
|
|
|
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
|
|
|
|
F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)
|
|
|
|
my_recall = num_true_positives / (matching_number * test_proportion)
|
|
|
|
my_recall = num_true_positives / (couple_number * test_proportion)
|
|
|
|
|
|
|
|
|
|
|
|
return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
|
|
|
|
return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
|
|
|
|
|
|
|
|
|
|
|
@ -88,13 +90,13 @@ def load_mds(paths: list) -> list:
|
|
|
|
return all_mds
|
|
|
|
return all_mds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_explicable(row, all_mds: list) -> bool:
|
|
|
|
def is_explicable(line, all_mds: list) -> bool:
|
|
|
|
attrs = all_mds[0].keys() # 从第一条md中读取所有字段
|
|
|
|
attrs = all_mds[0].keys() # 从第一条md中读取所有字段
|
|
|
|
for md in all_mds:
|
|
|
|
for md in all_mds:
|
|
|
|
explicable = True # 假设这条md能解释当前元组
|
|
|
|
explicable = True # 假设这条md能解释当前元组
|
|
|
|
for a in attrs:
|
|
|
|
for a in attrs:
|
|
|
|
threshold = md[a]
|
|
|
|
threshold = md[a]
|
|
|
|
if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
|
|
|
|
if my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)), str(getattr(line, 'rtable_' + a))) < threshold:
|
|
|
|
explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组
|
|
|
|
explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组
|
|
|
|
break # 不再与当前md的其他相似度阈值比较,跳转到下一条md
|
|
|
|
break # 不再与当前md的其他相似度阈值比较,跳转到下一条md
|
|
|
|
if explicable:
|
|
|
|
if explicable:
|
|
|
@ -122,7 +124,8 @@ class Classifier:
|
|
|
|
|
|
|
|
|
|
|
|
# train 就是整个函数 只需将返回结果由预测变成预测结果的评估
|
|
|
|
# train 就是整个函数 只需将返回结果由预测变成预测结果的评估
|
|
|
|
def train(self, config: Configuration, seed: int = 0) -> float:
|
|
|
|
def train(self, config: Configuration, seed: int = 0) -> float:
|
|
|
|
|
|
|
|
# print(f"BRAINFUCK:{config.values()}")
|
|
|
|
|
|
|
|
cm.del_catalog()
|
|
|
|
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀
|
|
|
|
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀
|
|
|
|
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀
|
|
|
|
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀
|
|
|
|
cm.set_key(selected_ltable, tables_id)
|
|
|
|
cm.set_key(selected_ltable, tables_id)
|
|
|
@ -131,28 +134,28 @@ class Classifier:
|
|
|
|
blocker = em.OverlapBlocker()
|
|
|
|
blocker = em.OverlapBlocker()
|
|
|
|
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
|
|
|
|
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
|
|
|
|
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
|
|
|
|
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
|
|
|
|
overlap_size=config["overlap_size"], show_progress=False, n_jobs=-1)
|
|
|
|
overlap_size=config["overlap_size"], show_progress=False)
|
|
|
|
elif config["ml_blocker"] == "attr_equiv":
|
|
|
|
elif config["ml_blocker"] == "attr_equiv":
|
|
|
|
blocker = em.AttrEquivalenceBlocker()
|
|
|
|
blocker = em.AttrEquivalenceBlocker()
|
|
|
|
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
|
|
|
|
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
|
|
|
|
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
|
|
|
|
l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
|
|
|
|
|
|
|
|
|
|
|
|
candidate['gold'] = 0
|
|
|
|
candidate['gold'] = 0
|
|
|
|
|
|
|
|
|
|
|
|
candidate_match_rows = []
|
|
|
|
candidate_match_rows = []
|
|
|
|
for index, row in candidate.iterrows():
|
|
|
|
for index_num, line in candidate.iterrows():
|
|
|
|
l_id = row['ltable_' + tables_id]
|
|
|
|
l_id = line['ltable_' + tables_id]
|
|
|
|
map_row = mappings[mappings[mapping_lid] == l_id]
|
|
|
|
map_row = mappings[mappings[mapping_lid] == l_id]
|
|
|
|
|
|
|
|
|
|
|
|
if map_row is not None:
|
|
|
|
if map_row is not None:
|
|
|
|
r_id = map_row[mapping_rid]
|
|
|
|
r_id = map_row[mapping_rid]
|
|
|
|
for value in r_id:
|
|
|
|
for value in r_id:
|
|
|
|
if value == row['rtable_' + tables_id]:
|
|
|
|
if value == line['rtable_' + tables_id]:
|
|
|
|
candidate_match_rows.append(row["_id"])
|
|
|
|
candidate_match_rows.append(line["_id"])
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
for row in candidate_match_rows:
|
|
|
|
for line in candidate_match_rows:
|
|
|
|
candidate.loc[row, 'gold'] = 1
|
|
|
|
candidate.loc[line, 'gold'] = 1
|
|
|
|
|
|
|
|
|
|
|
|
# 裁剪负样本,保持正负样本数量一致
|
|
|
|
# 裁剪负样本,保持正负样本数量一致
|
|
|
|
candidate_mismatch = candidate[candidate['gold'] == 0]
|
|
|
|
candidate_mismatch = candidate[candidate['gold'] == 0]
|
|
|
@ -243,12 +246,12 @@ class Classifier:
|
|
|
|
nepl_mismatch = 0 # 不可解释,预测mismatch
|
|
|
|
nepl_mismatch = 0 # 不可解释,预测mismatch
|
|
|
|
md_list = load_mds(md_paths) # 从全局变量中读取所有的md
|
|
|
|
md_list = load_mds(md_paths) # 从全局变量中读取所有的md
|
|
|
|
if len(md_list) > 0:
|
|
|
|
if len(md_list) > 0:
|
|
|
|
for row in predictions.itertuples():
|
|
|
|
for line in predictions.itertuples():
|
|
|
|
if is_explicable(row, md_list):
|
|
|
|
if is_explicable(line, md_list):
|
|
|
|
if getattr(row, 'predicted') == 1:
|
|
|
|
if getattr(line, 'predicted') == 1:
|
|
|
|
epl_match += 1
|
|
|
|
epl_match += 1
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
if getattr(row, 'predicted') == 0:
|
|
|
|
if getattr(line, 'predicted') == 0:
|
|
|
|
nepl_mismatch += 1
|
|
|
|
nepl_mismatch += 1
|
|
|
|
interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性
|
|
|
|
interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性
|
|
|
|
# if indicators["my_recall"] >= 0.8:
|
|
|
|
# if indicators["my_recall"] >= 0.8:
|
|
|
@ -270,6 +273,7 @@ def ml_er_hpo():
|
|
|
|
classifier.configspace,
|
|
|
|
classifier.configspace,
|
|
|
|
deterministic=True,
|
|
|
|
deterministic=True,
|
|
|
|
n_trials=10, # We want to run max 50 trials (combination of config and seed)
|
|
|
|
n_trials=10, # We want to run max 50 trials (combination of config and seed)
|
|
|
|
|
|
|
|
n_workers=2
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
|
|
|
|
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
|
|
|
@ -291,6 +295,6 @@ def ml_er_hpo():
|
|
|
|
# Let's calculate the cost of the incumbent
|
|
|
|
# Let's calculate the cost of the incumbent
|
|
|
|
incumbent_cost = smac.validate(incumbent)
|
|
|
|
incumbent_cost = smac.validate(incumbent)
|
|
|
|
print(f"Incumbent cost: {incumbent_cost}")
|
|
|
|
print(f"Incumbent cost: {incumbent_cost}")
|
|
|
|
print(f"Configuration:{incumbent.values()}")
|
|
|
|
print(f"Optimized_Configuration:{incumbent.values()}")
|
|
|
|
|
|
|
|
|
|
|
|
return incumbent
|
|
|
|
return incumbent
|
|
|
|