|
|
|
@ -1,5 +1,4 @@
|
|
|
|
|
import os
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
|
|
|
|
|
from ConfigSpace.conditions import InCondition
|
|
|
|
@ -8,8 +7,8 @@ import py_entitymatching.catalog.catalog_manager as cm
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
from smac import HyperparameterOptimizationFacade, Scenario
|
|
|
|
|
from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
|
|
|
|
|
from settings import *
|
|
|
|
|
from ml_er.ml_entity_resolver import evaluate_prediction, load_mds, is_explicable
|
|
|
|
|
|
|
|
|
|
# 数据在外部加载
|
|
|
|
|
########################################################################################################################
|
|
|
|
@ -39,71 +38,6 @@ selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字
|
|
|
|
|
########################################################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, couple_number: int,
                        test_proportion: float) -> dict:
    """Score binary predictions against gold labels.

    Rows are counted as positive when the column value equals 1 and as
    negative when it equals 0; any other value is ignored by every counter.

    Args:
        df: Frame holding one row per candidate pair.
        labeled_attr: Name of the column with the gold 0/1 label.
        predicted_attr: Name of the column with the predicted 0/1 label.
        couple_number: Used together with ``test_proportion`` as the
            denominator of ``my_recall`` (presumably the total number of true
            matching pairs in the whole data set -- confirm with the caller).
        test_proportion: Fraction multiplied with ``couple_number`` to form
            the ``my_recall`` denominator.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"F1"`` and
        ``"my_recall"``. Every ratio is reported as ``0.0`` when its
        denominator is zero.
    """
    # Move the index into a regular column so the label columns can be looked
    # up by name even if one of them was stored as the index.
    new_df = df.reset_index(drop=False, inplace=False)
    gold = new_df[labeled_attr]
    predicted = new_df[predicted_attr]

    gold_positive = gold == 1
    gold_negative = gold == 0
    predicted_positive = predicted == 1
    predicted_negative = predicted == 0

    # Both series share new_df's index, so the masks combine row by row.
    num_true_positives = float((gold_positive & predicted_positive).sum())
    num_false_positives = float((gold_negative & predicted_positive).sum())
    num_false_negatives = float((gold_positive & predicted_negative).sum())

    precision_denominator = num_true_positives + num_false_positives
    recall_denominator = num_true_positives + num_false_negatives

    precision = 0.0 if precision_denominator == 0.0 else num_true_positives / precision_denominator
    recall = 0.0 if recall_denominator == 0.0 else num_true_positives / recall_denominator
    F1 = 0.0 if precision == 0.0 and recall == 0.0 else (2.0 * precision * recall) / (precision + recall)

    # Guard the custom recall the same way as the other ratios: an empty data
    # set or a zero test proportion must not raise ZeroDivisionError.
    my_recall_denominator = couple_number * test_proportion
    my_recall = 0.0 if my_recall_denominator == 0 else num_true_positives / my_recall_denominator

    return {"precision": precision, "recall": recall, "F1": F1, "my_recall": my_recall}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_mds(paths: list) -> list:
    """Read MDs from a list of tab-separated text files.

    Each non-blank line is split on tabs. Field 0 holds the MD itself,
    prefixed with ``md:``; field 2 holds its confidence, prefixed with
    ``confidence:``. Field 1 is not used. Only MDs whose confidence is
    greater than 0 are kept.

    Args:
        paths: Paths of the MD files. Paths that do not exist are skipped
            silently.

    Returns:
        All accepted MDs, in file order and then line order. Empty when
        ``paths`` is empty or none of the files exist.
    """
    all_mds = []
    for md_path in paths:
        if not os.path.exists(md_path):
            continue
        with open(md_path, 'r') as f:
            # Stream the file line by line instead of materialising it.
            for line in f:
                record = line.strip()
                if not record:
                    # Blank or trailing empty lines carry no MD; evaluating
                    # them would raise SyntaxError.
                    continue
                md_metadata = record.split('\t')
                # SECURITY NOTE(review): eval() executes arbitrary expressions
                # found in the file. This is only acceptable while MD files
                # are produced by trusted code; consider ast.literal_eval if
                # the payloads are plain literals.
                md = eval(md_metadata[0].replace('md:', ''))
                confidence = eval(md_metadata[2].replace('confidence:', ''))
                if confidence > 0:
                    all_mds.append(md)
    return all_mds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_explicable(line, all_mds: list) -> bool:
    """Tell whether at least one MD explains the given record pair.

    An MD explains the pair when, for every attribute, the Levenshtein ratio
    between the pair's ``ltable_<attr>`` and ``rtable_<attr>`` values (both
    converted to ``str``) is not below the MD's threshold for that attribute.

    Args:
        line: Object exposing ``ltable_<attr>`` / ``rtable_<attr>`` attributes
            (presumably a row from ``DataFrame.itertuples`` -- confirm with
            the caller).
        all_mds: MDs as mappings from attribute name to similarity threshold.
            The attribute set is taken from the first MD.

    Returns:
        True as soon as one MD explains the pair; False when none does or
        when ``all_mds`` is empty.
    """
    if not all_mds:
        # Nothing to check against; previously this raised IndexError.
        return False

    attrs = all_mds[0].keys()  # read the attribute names from the first MD
    for md in all_mds:
        for a in attrs:
            threshold = md[a]
            similarity = my_Levenshtein_ratio(str(getattr(line, 'ltable_' + a)),
                                              str(getattr(line, 'rtable_' + a)))
            if similarity < threshold:
                # One attribute misses its threshold, so this MD cannot
                # explain the pair; move on to the next MD.
                break
        else:
            # Every attribute met its threshold: this MD explains the pair.
            return True
    return False  # no MD explained the pair
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Classifier:
|
|
|
|
|
@property
|
|
|
|
|
def configspace(self) -> ConfigurationSpace:
|
|
|
|
@ -273,7 +207,7 @@ def ml_er_hpo():
|
|
|
|
|
classifier.configspace,
|
|
|
|
|
deterministic=True,
|
|
|
|
|
n_trials=10, # We want to run max 50 trials (combination of config and seed)
|
|
|
|
|
n_workers=2
|
|
|
|
|
n_workers=1
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
|
|
|
|
|