|
|
|
@ -1,20 +1,50 @@
|
|
|
|
|
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
|
|
|
|
|
from ConfigSpace.conditions import InCondition
|
|
|
|
|
import py_entitymatching as em
|
|
|
|
|
import py_entitymatching.catalog.catalog_manager as cm
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from py_entitymatching.blocker.blocker import Blocker
|
|
|
|
|
from py_entitymatching.matcher.mlmatcher import MLMatcher
|
|
|
|
|
|
|
|
|
|
from smac import HyperparameterOptimizationFacade, Scenario
|
|
|
|
|
from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
|
|
|
|
|
from entrance import *
|
|
|
|
|
# TODO: let the user configure the distance metric
|
|
|
|
|
|
|
|
|
|
# Global variable: the list is cleared after each iteration and the new MD paths are added
|
|
|
|
|
# todo:
|
|
|
|
|
# The default path is "../md_discovery/output/xxx.txt"
|
|
|
|
|
# Four MD files in total: true-positive / false-negative, each with mds / vio
|
|
|
|
|
# The data is loaded outside (at module level)
|
|
|
|
|
########################################################################################################################
|
|
|
|
|
# Load both tables, set their key columns through the py_entitymatching
# catalog manager, and replace missing values with empty strings.
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
cm.set_key(ltable, ltable_id)
ltable.fillna("", inplace=True)
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
cm.set_key(rtable, rtable_id)
rtable.fillna("", inplace=True)
mappings = pd.read_csv(mapping_path)

# Convert everything to strings
ltable = ltable.astype(str)
rtable = rtable.astype(str)
mappings = mappings.astype(str)
matching_number = len(mappings)  # total number of positive samples; should be 1300 for the product dataset

# IDs referenced by the mapping table, read column-wise instead of looping
# over rows with iterrows(); the resulting lists are the same.
lid_mapping_list = mappings[mapping_lid].tolist()
rid_mapping_list = mappings[mapping_rid].tolist()

# Keep only the rows of both tables that appear in the mapping table,
# which raises the proportion of positive samples
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
# Following the right table, rename left-table fields that correspond to
# right-table fields but carry a different name
selected_ltable = selected_ltable.rename(columns=lr_attrs_map)
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
selected_attrs = selected_ltable.columns.values.tolist()  # field names of the two tables
attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]
attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]
# The filtered frames are new objects, so their keys are set again
cm.set_key(selected_ltable, ltable_id)
cm.set_key(selected_rtable, rtable_id)
|
|
|
|
|
########################################################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_test():
    """Print the candidate blocking attributes.

    The candidates are a copy of the module-level ``selected_attrs`` with
    the right-table key ``rtable_id`` removed; ``selected_attrs`` itself is
    left untouched.
    """
    candidates = list(selected_attrs)
    candidates.remove(rtable_id)
    print(candidates)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
|
|
|
|
@ -80,54 +110,27 @@ def is_explicable(row, all_mds: list) -> bool:
|
|
|
|
|
return False # 遍历结束,不能解释
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SVM:
|
|
|
|
|
class Classifier:
|
|
|
|
|
@property
def configspace(self) -> ConfigurationSpace:
    """Build the configuration space searched by the optimizer.

    Hyperparameters:
      * ``block_attr``   - attribute used for blocking; the choices are the
        module-level ``selected_attrs`` without the right-table key
        ``rtable_id``.
      * ``overlap_size`` - integer in [1, 3], default 1; only active when
        ``ml_blocker`` is ``"over_lap"``.
      * ``ml_matcher``   - one of "dt", "svm", "rf", "lg", "ln", "nb";
        default "rf".
      * ``ml_blocker``   - "over_lap" or "attr_equiv"; default "over_lap".

    Returns:
        The populated ``ConfigurationSpace`` (seeded with 0).
    """
    # Build Configuration Space which defines all parameters and their ranges
    cs = ConfigurationSpace(seed=0)

    # Derive the blocking candidates from the loaded data rather than a
    # hard-coded attribute list. Work on a copy so the module-level
    # selected_attrs is not mutated.
    block_attr_items = selected_attrs[:]
    block_attr_items.remove(rtable_id)

    block_attr = Categorical("block_attr", block_attr_items)
    overlap_size = Integer("overlap_size", (1, 3), default=1)
    ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
    ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
    # TODO: other tunable parameters (e.g. dropping a column from the feature table)

    # overlap_size is only meaningful for the overlap blocker
    use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
    cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
    cs.add_conditions([use_overlap_size])
    return cs
|
|
|
|
|
|
|
|
|
|
# train is the whole function; only the return value needs to change from the predictions to an evaluation of the predictions
|
|
|
|
|
def train(self, config: Configuration) -> float:
|
|
|
|
|
ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
|
|
|
|
|
cm.set_key(ltable, ltable_id)
|
|
|
|
|
ltable.fillna("", inplace=True)
|
|
|
|
|
rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
|
|
|
|
|
cm.set_key(rtable, rtable_id)
|
|
|
|
|
rtable.fillna("", inplace=True)
|
|
|
|
|
mappings = pd.read_csv(mapping_path)
|
|
|
|
|
|
|
|
|
|
# 仅保留两表中出现在映射表中的行,增大正样本比例
|
|
|
|
|
lid_mapping_list = []
|
|
|
|
|
rid_mapping_list = []
|
|
|
|
|
# 全部转为字符串
|
|
|
|
|
ltable = ltable.astype(str)
|
|
|
|
|
rtable = rtable.astype(str)
|
|
|
|
|
mappings = mappings.astype(str)
|
|
|
|
|
matching_number = len(mappings) # 所有阳性样本数,商品数据集应为1300
|
|
|
|
|
|
|
|
|
|
for index, row in mappings.iterrows():
|
|
|
|
|
lid_mapping_list.append(row[mapping_lid])
|
|
|
|
|
rid_mapping_list.append(row[mapping_rid])
|
|
|
|
|
|
|
|
|
|
selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
|
|
|
|
|
selected_ltable = selected_ltable.rename(columns=lr_attrs_map) # 参照右表,修改左表中与右表对应但不同名的字段
|
|
|
|
|
selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
|
|
|
|
|
selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名
|
|
|
|
|
attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
|
|
|
|
|
attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
|
|
|
|
|
cm.set_key(selected_ltable, ltable_id)
|
|
|
|
|
cm.set_key(selected_rtable, rtable_id)
|
|
|
|
|
|
|
|
|
|
blocker = None
|
|
|
|
|
def train(self, config: Configuration, seed: int = 0) -> float:
|
|
|
|
|
if config["ml_blocker"] == "over_lap":
|
|
|
|
|
blocker = em.OverlapBlocker()
|
|
|
|
|
candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
|
|
|
|
@ -195,6 +198,7 @@ class SVM:
|
|
|
|
|
attrs_after=['gold'],
|
|
|
|
|
show_progress=False)
|
|
|
|
|
|
|
|
|
|
# todo 属性名解耦
|
|
|
|
|
test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
|
|
|
|
|
attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
|
|
|
|
|
'ltable_price', 'rtable_name', 'rtable_description',
|
|
|
|
@ -225,6 +229,8 @@ class SVM:
|
|
|
|
|
predictions_attrs.extend(['gold', 'predicted'])
|
|
|
|
|
predictions = predictions[predictions_attrs]
|
|
|
|
|
|
|
|
|
|
# 默认路径为 "../md_discovery/output/xxx.txt"
|
|
|
|
|
# 真阳/假阴 mds/vio 共4个md文件
|
|
|
|
|
md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
|
|
|
|
|
'../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
|
|
|
|
|
epl_match = 0 # 可解释,预测match
|
|
|
|
@ -244,7 +250,7 @@ class SVM:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
classifier = SVM()
|
|
|
|
|
classifier = Classifier()
|
|
|
|
|
|
|
|
|
|
# Next, we create an object, holding general information about the run
|
|
|
|
|
scenario = Scenario(
|
|
|
|
|