HuangJintao 1 year ago
parent c349768eaf
commit e18295838a

@@ -1,29 +1,28 @@
 # this is the entrance of the auto-ER procedure
 from md_discovery.script.md_discover import md_discover
+from ml_er.ml_entity_resolver import ml_er
+from hpo.er_model_hpo import ml_er_hpo
+from settings import *
-ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
-mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
-ltable_id = 'id'  # name of the id column in the left table
-rtable_id = 'id'  # name of the id column in the right table
-target_attr = 'id'  # target attribute for MD discovery
-lr_attrs_map = {'title': 'name'}  # if corresponding columns are named differently in the two tables, list them here so they can be renamed to match
-similarity_threshold = 0.7
-confidence_threshold = 0.8
-interpretability_weight = 0.3
-def run(l_table_path, r_table_path, mapping_path):
-    # while the termination condition is not met:
-    while True:
-        # er()
+def run(rounds: int):
+    hp_config = None
+    # while the termination condition is not met
+    iter_round = 1
+    for i in range(0, rounds):
+        ml_er(iter_round, hp_config)
         md_discover()
-        # hpo()
+        hp_config = ml_er_hpo()
+        iter_round += 1
+    ml_er(iter_round, hp_config)
     return
 if __name__ == '__main__':
-    # todo: should the distance metric be user-configurable?
+    path = 'md_discovery/output'
+    # todo
+    # should the distance metric be user-configurable?
+    # use drop to remove columns from the feature vectors (e.g. id-related features)
+    run(1)
+    # ml_er(1)
     print(ltable_path)

@@ -1,4 +1,5 @@
-from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+import os
+from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
@@ -6,15 +7,13 @@ import pandas as pd
 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
+from settings import *
 # data is loaded here at module level
 ########################################################################################################################
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-cm.set_key(ltable, ltable_id)
 ltable.fillna("", inplace=True)
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-cm.set_key(rtable, rtable_id)
 rtable.fillna("", inplace=True)
 mappings = pd.read_csv(mapping_path)
@@ -32,21 +31,12 @@ for index, row in mappings.iterrows():
 # keep only the rows that appear in the mapping table, to raise the proportion of positive samples
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
 selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # following the right table, rename left-table columns that correspond to right-table columns but are named differently
+tables_id = rtable_id  # whether or not the two id column names match, after the rename above the right table's id name is used for both tables
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # column names of the two tables
-attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-cm.set_key(selected_ltable, ltable_id)
-cm.set_key(selected_rtable, rtable_id)
 ########################################################################################################################
-def test_test():
-    block_attr_items = selected_attrs[:]
-    block_attr_items.remove(rtable_id)
-    print(block_attr_items)
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
                         test_proportion: float) -> dict:
     new_df = df.reset_index(drop=False, inplace=False)
@@ -82,6 +72,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # the argument is a list of MD file paths
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # open each MD file
         with open(md_path, 'r') as f:
@@ -102,7 +94,7 @@ def is_explicable(row, all_mds: list) -> bool:
         explicable = True  # assume this MD can explain the current tuple pair
         for a in attrs:
             threshold = md[a]
-            if my_Levenshtein_ratio(str(getattr(row, 'ltable_'+a)), str(getattr(row, 'rtable_'+a))) < threshold:
+            if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold:
                 explicable = False  # if any attribute's similarity is below its threshold, this MD cannot explain the tuple pair
                 break  # stop checking this MD's remaining thresholds and move on to the next MD
         if explicable:
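The loop above encodes the MD semantics used throughout this commit: an MD is a dict mapping attribute names to minimum similarity thresholds, and a record pair is "explicable" only if every attribute reaches its threshold. A standalone sketch of that check (illustration only, not part of the commit; difflib's ratio stands in for my_Levenshtein_ratio, and the MD and record pair are made up):

import difflib

def sim(a: str, b: str) -> float:
    # stand-in similarity; the project uses my_Levenshtein_ratio instead
    return difflib.SequenceMatcher(None, a, b).ratio()

md = {'name': 0.7, 'manufacturer': 0.6}  # hypothetical thresholds on the MD left-hand side
pair = {'ltable_name': 'canon powershot a510', 'rtable_name': 'canon powershot a510 camera',
        'ltable_manufacturer': 'canon', 'rtable_manufacturer': 'canon inc.'}

explicable = all(sim(str(pair['ltable_' + a]), str(pair['rtable_' + a])) >= md[a] for a in md)
print(explicable)  # True only if every attribute meets its threshold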
@@ -116,13 +108,12 @@ class Classifier:
         # Build Configuration Space which defines all parameters and their ranges
         cs = ConfigurationSpace(seed=0)
         block_attr_items = selected_attrs[:]
-        block_attr_items.remove(rtable_id)
+        block_attr_items.remove(tables_id)
         block_attr = Categorical("block_attr", block_attr_items)
         overlap_size = Integer("overlap_size", (1, 3), default=1)
         ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
         ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
-        # todo: other tunable parameters (e.g. dropping a column from the feature table)
         use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
         cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
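The InCondition above makes overlap_size a conditional hyperparameter: it is only active when ml_blocker is "over_lap". A standalone sketch of that behavior (illustration only, not part of the commit; "title" and "price" are placeholder attribute names, and registering the condition with cs.add_conditions is assumed to happen as in the full file):

from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import InCondition

cs = ConfigurationSpace(seed=0)
block_attr = Categorical("block_attr", ["title", "price"])
overlap_size = Integer("overlap_size", (1, 3), default=1)
ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
cs.add_hyperparameters([block_attr, overlap_size, ml_blocker])
cs.add_conditions([InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])])

for cfg in cs.sample_configuration(5):
    # overlap_size appears only in configurations where ml_blocker == "over_lap"
    print(dict(cfg))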
@@ -131,6 +122,11 @@ class Classifier:
     # train runs the whole ER pipeline; the only difference is that it returns an evaluation of the predictions instead of the predictions themselves
     def train(self, config: Configuration, seed: int = 0) -> float:
+        attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # column names with the left-table prefix
+        attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # column names with the right-table prefix
+        cm.set_key(selected_ltable, tables_id)
+        cm.set_key(selected_rtable, tables_id)
         if config["ml_blocker"] == "over_lap":
             blocker = em.OverlapBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@@ -145,13 +141,13 @@ class Classifier:
         candidate_match_rows = []
         for index, row in candidate.iterrows():
-            l_id = row['ltable_' + ltable_id]
+            l_id = row['ltable_' + tables_id]
             map_row = mappings[mappings[mapping_lid] == l_id]
             if map_row is not None:
                 r_id = map_row[mapping_rid]
                 for value in r_id:
-                    if value == row['rtable_' + rtable_id]:
+                    if value == row['rtable_' + tables_id]:
                         candidate_match_rows.append(row["_id"])
                     else:
                         continue
@@ -165,9 +161,12 @@ class Classifier:
             candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match))
         # concatenate the positive and negative samples
         candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
+        if len(candidate_for_train_test) == 0:
+            return 1
         cm.set_key(candidate_for_train_test, '_id')
-        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+        cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+        cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
         cm.set_ltable(candidate_for_train_test, selected_ltable)
         cm.set_rtable(candidate_for_train_test, selected_rtable)
@@ -178,7 +177,18 @@ class Classifier:
         train_set = sets['train']
         test_set = sets['test']
-        matcher = None
+        cm.set_key(train_set, '_id')
+        cm.set_fk_ltable(train_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(train_set, 'rtable_' + tables_id)
+        cm.set_ltable(train_set, selected_ltable)
+        cm.set_rtable(train_set, selected_rtable)
+        cm.set_key(test_set, '_id')
+        cm.set_fk_ltable(test_set, 'ltable_' + tables_id)
+        cm.set_fk_rtable(test_set, 'rtable_' + tables_id)
+        cm.set_ltable(test_set, selected_ltable)
+        cm.set_rtable(test_set, selected_rtable)
         if config["ml_matcher"] == "dt":
             matcher = em.DTMatcher(name='DecisionTree', random_state=0)
         elif config["ml_matcher"] == "svm":
@@ -198,25 +208,21 @@ class Classifier:
                                                      attrs_after=['gold'],
                                                      show_progress=False)
-        # todo: decouple the attribute names
+        test_feature_after = attrs_with_l_prefix[:]
+        test_feature_after.extend(attrs_with_r_prefix)
+        for _ in test_feature_after:
+            if _.endswith(tables_id):
+                test_feature_after.remove(_)
+        test_feature_after.append('gold')
         test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
-                                                                 'ltable_price', 'rtable_name', 'rtable_description',
-                                                                 'rtable_manufacturer', 'rtable_price', 'gold'], show_progress=False)
-        # todo: tunable parameters; use drop to remove columns from the feature vectors
-        # 1.exclude_attrs
-        # drop the id-related similarity features
-        matcher.fit(table=train_feature_vecs,
-                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-                    target_attr='gold')
-        # 1.exclude_attrs
-        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title',
-                                                                              'ltable_description', 'ltable_manufacturer',
-                                                                              'ltable_price', 'rtable_name', 'rtable_description',
-                                                                              'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                    attrs_after=test_feature_after, show_progress=False)
+        fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+        matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+        test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+        predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                       append=True, target_attr='predicted', inplace=False)
         eval_result = em.eval_matches(predictions, 'gold', 'predicted')
         em.print_eval_summary(eval_result)
         indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, test_proportion)
@@ -231,34 +237,42 @@ class Classifier:
         # default path: "../md_discovery/output/xxx.txt"
         # 4 MD files in total: tp/fn, each with mds and vio
-        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
-                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
+        md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                    'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
         epl_match = 0  # explainable and predicted as a match
         nepl_mismatch = 0  # not explainable and predicted as a mismatch
         md_list = load_mds(md_paths)  # read in all the MDs
-        for row in predictions.itertuples():
-            if is_explicable(row, md_list):
-                if getattr(row, 'predicted') == 1:
-                    epl_match += 1
-            else:
-                if getattr(row, 'predicted') == 0:
-                    nepl_mismatch += 1
-        epl_ability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
-        f1 = indicators['F1']
-        performance = interpretability_weight * epl_ability + (1 - interpretability_weight) * f1
+        if len(md_list) > 0:
+            for row in predictions.itertuples():
+                if is_explicable(row, md_list):
+                    if getattr(row, 'predicted') == 1:
+                        epl_match += 1
+                else:
+                    if getattr(row, 'predicted') == 0:
+                        nepl_mismatch += 1
+        interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+        # if indicators["my_recall"] >= 0.8:
+        #     f1 = indicators["F1"]
+        # else:
+        #     f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+        if indicators["my_recall"] < 0.8:
+            return 1
+        f1 = indicators["F1"]
+        performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
         return 1 - performance
-if __name__ == "__main__":
+def ml_er_hpo():
     classifier = Classifier()
     # Next, we create an object, holding general information about the run
     scenario = Scenario(
         classifier.configspace,
-        n_trials=12,  # We want to run max 50 trials (combination of config and seed)
+        deterministic=True,
+        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
     )
-    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=3)
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
     # Now we use SMAC to find the best hyperparameters
     smac = HyperparameterOptimizationFacade(
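SMAC minimizes the value returned by train, so that return value acts as a cost: any configuration whose my_recall falls below 0.8 gets the worst possible cost of 1, everything else gets 1 - performance, where performance = interpre_weight * interpretability + (1 - interpre_weight) * F1. A quick numeric check with made-up numbers (illustration only), using interpre_weight = 0.3 from settings:

interpre_weight = 0.3

def cost(f1: float, interpretability: float, my_recall: float) -> float:
    # mirrors the value Classifier.train returns above
    if my_recall < 0.8:
        return 1  # recall gate: a low-recall config can never become the incumbent
    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
    return 1 - performance

print(cost(0.90, 0.60, 0.85))  # 1 - (0.3 * 0.6 + 0.7 * 0.9) = 0.19, up to float rounding
print(cost(0.95, 0.20, 0.75))  # 1, rejected by the recall gate despite the higher F1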
@@ -268,9 +282,6 @@ if __name__ == "__main__":
         overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
     )
-    # todo
-    # if new_recall is too low, keep that configuration from becoming the incumbent
-    # either set the loss to 1, or use new_recall to pull F1 down and thereby raise the loss
     incumbent = smac.optimize()
     # Get cost of default configuration
@@ -280,6 +291,6 @@ if __name__ == "__main__":
     # Let's calculate the cost of the incumbent
     incumbent_cost = smac.validate(incumbent)
     print(f"Incumbent cost: {incumbent_cost}")
     print(f"Configuration:{incumbent.values()}")
-    print(f"MAX_F1:{1-classifier.train(incumbent)}")
+    return incumbent
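ml_er_hpo() now returns the incumbent Configuration instead of printing a score, and run() in the entrance script feeds it straight into the next ml_er round. A small usage sketch (illustration only; dict-style access on a ConfigSpace Configuration is the same pattern ml_er uses below):

hp_config = ml_er_hpo()
print(hp_config["ml_matcher"], hp_config["ml_blocker"], hp_config["block_attr"])
if hp_config["ml_blocker"] == "over_lap":
    print(hp_config["overlap_size"])  # only present when the overlap blocker was chosen
ml_er(2, hp_config)  # the next round reuses the tuned configuration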

@@ -1,7 +1,7 @@
 import time
 from md_discovery.functions.multi_process_infer_by_pairs import inference_from_record_pairs
 from md_discovery.functions.multi_process_infer_by_pairs import get_mds_metadata
-from entrance import *
+from settings import *
 # # if support and confidence are not written out, use the following two blocks of code
 # # write list 1 to a local path (adjust the path yourself)
@@ -19,8 +19,8 @@ from entrance import *
 def md_discover():
     # for now, new code can follow the pattern of this function
-    tp_single_tuple_path = "../../ml_er/output/tp_single_tuple.csv"
-    fn_single_tuple_path = "../../ml_er/output/fn_single_tuple.csv"
+    tp_single_tuple_path = "ml_er/output/tp_single_tuple.csv"
+    fn_single_tuple_path = "ml_er/output/fn_single_tuple.csv"
     # input: a csv file path, the similarity threshold for the MD left-hand side, and the target attribute on the MD right-hand side
     # output: two MD lists; MDs in list 1 have no violations, MDs in list 2 have violations but their confidence meets the threshold (0.8)
     # e.g. the parameters here require every left-hand-side similarity to be at least 0.7, with the right-hand side pointing to the 'id' attribute
@@ -36,8 +36,8 @@ def md_discover():
     # if support and confidence are written out, use the following two blocks of code
     # write list 1 to a local path (adjust the path yourself)
-    tp_mds_path = "../output/tp_mds.txt"
-    tp_vio_path = "../output/tp_vio.txt"
+    tp_mds_path = "md_discovery/output/tp_mds.txt"
+    tp_vio_path = "md_discovery/output/tp_vio.txt"
     with open(tp_mds_path, 'w') as f:
         for _ in tp_mds_meta:
@@ -51,8 +51,8 @@ def md_discover():
                 f.write(i + ':' + str(_[i]) + '\t')
             f.write('\n')
-    fn_mds_path = "../output/fn_mds.txt"
-    fn_vio_path = "../output/fn_vio.txt"
+    fn_mds_path = "md_discovery/output/fn_mds.txt"
+    fn_vio_path = "md_discovery/output/fn_vio.txt"
     with open(fn_mds_path, 'w') as f:
         for _ in fn_mds_meta:
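md_discover() serializes each MD metadata dict as one line of tab-separated key:value pairs. A tiny round-trip sketch (illustration only, not part of the commit; the keys are assumptions, the real dicts come from get_mds_metadata, and load_mds's exact parsing is not shown in this diff):

mds_meta = [{'name': 0.7, 'description': 0.7, 'price': 1.0, 'support': 120, 'confidence': 0.85}]

with open('demo_mds.txt', 'w') as f:
    for md in mds_meta:
        for key in md.keys():
            f.write(key + ':' + str(md[key]) + '\t')
        f.write('\n')

# one plausible way to read such a line back into a dict
with open('demo_mds.txt', 'r') as f:
    for line in f:
        items = (kv.split(':') for kv in line.strip().split('\t'))
        print({k: float(v) for k, v in items})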

@@ -1,3 +1,4 @@
+import os
 import sys
 from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric
@@ -11,19 +12,18 @@ import time
 import six
 from ConfigSpace import Configuration
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
-from entrance import *
-from hpo.magellan_hpo import incumbent
+from settings import *
-def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "output/tp_single_tuple.csv", fn_single_tuple_path: str = "output/fn_single_tuple.csv"):
+def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "ml_er/output/tp_single_tuple.csv", fn_single_tuple_path: str = "ml_er/output/fn_single_tuple.csv"):
     # extract the true-positive and false-negative parts of the prediction table
     tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)]
     fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)]
     # make the left and right ids identical in the true-positive / false-negative tables
     for index, row in tp.iterrows():
-        tp.loc[index, "rtable_id"] = row["ltable_id"]
+        tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
     for index, row in fn.iterrows():
-        fn.loc[index, "rtable_id"] = row["ltable_id"]
+        fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id]
     pred_columns = pred.columns.values.tolist()
     l_columns = []
@@ -93,6 +93,8 @@ def load_mds(paths: list) -> list:
     all_mds = []
     # the argument is a list of MD file paths
     for md_path in paths:
+        if not os.path.exists(md_path):
+            continue
         mds = []
         # open each MD file
         with open(md_path, 'r') as f:
@@ -137,14 +139,10 @@ def load_data(left_path: str, right_path: str, mapping_path: str):
     return left, right, mapping
-def ml_er(config: Configuration):
+def ml_er(iter_round: int, config: Configuration = None, ):
     # todo:
     # if config is not None -> load configs
-    # else use default configs
-    # 1. block_attr
-    # 2. overlap_size
-    # 3. ml_matcher
-    # 4. ml_blocker
+    # else -> use default configs
     ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
     cm.set_key(ltable, ltable_id)
     ltable.fillna("", inplace=True)
@@ -168,28 +166,58 @@ def ml_er(config: Configuration):
     selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
     selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # following the right table, rename left-table columns that correspond to right-table columns but are named differently
+    tables_id = rtable_id
     selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
     selected_attrs = selected_ltable.columns.values.tolist()  # column names of the two tables
+    items_but_id = selected_attrs[:]
+    items_but_id.remove(tables_id)  # column names of the two tables, excluding the id
     attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
     attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-    cm.set_key(selected_ltable, ltable_id)
-    cm.set_key(selected_rtable, rtable_id)
+    cm.set_key(selected_ltable, tables_id)
+    cm.set_key(selected_rtable, tables_id)
+    if config is not None:
+        ml_matcher = config["ml_matcher"]
+        if ml_matcher == "dt":
+            matcher = em.DTMatcher(name='DecisionTree', random_state=0)
+        elif ml_matcher == "svm":
+            matcher = em.SVMMatcher(name='SVM', random_state=0)
+        elif ml_matcher == "rf":
+            matcher = em.RFMatcher(name='RF', random_state=0)
+        elif ml_matcher == "lg":
+            matcher = em.LogRegMatcher(name='LogReg', random_state=0)
+        elif ml_matcher == "ln":
+            matcher = em.LinRegMatcher(name='LinReg')
+        elif ml_matcher == "nb":
+            matcher = em.NBMatcher(name='NaiveBayes')
+        if config["ml_blocker"] == "over_lap":
+            blocker = em.OverlapBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                             overlap_size=config["overlap_size"], show_progress=False)
+        elif config["ml_blocker"] == "attr_equiv":
+            blocker = em.AttrEquivalenceBlocker()
+            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+    else:
+        matcher = em.RFMatcher(name='RF', random_state=0)
+        blocker = em.OverlapBlocker()
+        candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0],
+                                         l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
+                                         overlap_size=1, show_progress=False)
-    blocker = em.OverlapBlocker()
-    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
-                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,
-                                     overlap_size=1, show_progress=False)
     candidate['gold'] = 0
     candidate_match_rows = []
     for index, row in candidate.iterrows():
-        l_id = row['ltable_' + ltable_id]
+        l_id = row['ltable_' + tables_id]
         map_row = mappings[mappings[mapping_lid] == l_id]
         if map_row is not None:
             r_id = map_row[mapping_rid]
             for value in r_id:
-                if value == row['rtable_' + rtable_id]:
+                if value == row['rtable_' + tables_id]:
                     candidate_match_rows.append(row["_id"])
                 else:
                     continue
@@ -204,19 +232,18 @@ def ml_er(config: Configuration):
     # concatenate the positive and negative samples
     candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match])
     cm.set_key(candidate_for_train_test, '_id')
-    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id)
-    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id)
+    cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id)
+    cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id)
     cm.set_ltable(candidate_for_train_test, selected_ltable)
     cm.set_rtable(candidate_for_train_test, selected_rtable)
     # split into training and test sets
     train_proportion = 0.7
     test_proportion = 0.3
-    sets = em.split_train_test(candidate_for_train_test, train_proportion=0.7, random_state=0)
+    sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0)
     train_set = sets['train']
     test_set = sets['test']
-    rf = em.RFMatcher(name='RF', random_state=0)
     feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False)
     train_feature_vecs = em.extract_feature_vecs(train_set,
@@ -224,20 +251,19 @@ def ml_er(config: Configuration):
                                                   attrs_after=['gold'],
                                                   show_progress=False)
+    test_feature_after = attrs_with_l_prefix[:]
+    test_feature_after.extend(attrs_with_r_prefix)
+    for _ in test_feature_after:
+        if _.endswith(tables_id):
+            test_feature_after.remove(_)
+    test_feature_after.append('gold')
     test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
-                                                attrs_after=['ltable_name', 'ltable_description', 'ltable_manufacturer',
-                                                             'ltable_price', 'rtable_name', 'rtable_description',
-                                                             'rtable_manufacturer', 'rtable_price', 'gold'],
-                                                show_progress=False)
-    rf.fit(table=train_feature_vecs,
-           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
-           target_attr='gold')
-    predictions = rf.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_name',
-                                                                     'ltable_description', 'ltable_manufacturer',
-                                                                     'ltable_price', 'rtable_name',
-                                                                     'rtable_description',
-                                                                     'rtable_manufacturer', 'rtable_price', 'gold'],
+                                                attrs_after=test_feature_after, show_progress=False)
+    fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold']
+    matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold')
+    test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id])
+    predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after,
                                   append=True, target_attr='predicted', inplace=False)
     eval_result = em.eval_matches(predictions, 'gold', 'predicted')
     em.print_eval_summary(eval_result)
@@ -252,28 +278,35 @@ def ml_er(config: Configuration):
     predictions_attrs.extend(['gold', 'predicted'])
     predictions = predictions[predictions_attrs]
+    md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt',
+                'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt']
     epl_match = 0  # explainable and predicted as a match
     nepl_mismatch = 0  # not explainable and predicted as a mismatch
-    p_md = "/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt"
-    p_vio = "/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt"
-    md_paths: list = [p_md, p_vio]
-    md_list = load_mds(md_paths)  # read in all the MDs
-    for row in predictions.itertuples():
-        if is_explicable(row, md_list):
-            if getattr(row, 'predicted') == 1:
-                epl_match += 1
-        else:
-            if getattr(row, 'predicted') == 0:
-                nepl_mismatch += 1
-    epl_ability = (epl_match + nepl_mismatch) / len(predictions)
+    md_list = load_mds(md_paths)  # read in all the MDs
+    if len(md_list) > 0:
+        for row in predictions.itertuples():
+            if is_explicable(row, md_list):
+                if getattr(row, 'predicted') == 1:
+                    epl_match += 1
+            else:
+                if getattr(row, 'predicted') == 0:
+                    nepl_mismatch += 1
+    interpretability = (epl_match + nepl_mismatch) / len(predictions)  # interpretability
+    if indicators["my_recall"] >= 0.8:
+        f1 = indicators["F1"]
+    else:
+        f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
+    performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
     ################################################################################################################
     process_prediction_for_md_discovery(predictions)
-    output_path = "output/eval_result" + str(time.time()) + ".txt"
+    output_path = "ml_er/output/eval_result_" + str(iter_round) + ".txt"
     with open(output_path, 'w') as f:
         for key, value in six.iteritems(_get_metric(eval_result)):
             f.write(key + " : " + value)
             f.write('\n')
-        f.write('my_recall:' + str(indicators["my_recall"]))
-        f.write('\n')
+        f.write('my_recall:' + str(indicators["my_recall"]) + '\n')
+        f.write('interpretability:' + str(interpretability) + '\n')
+        f.write('performance:' + str(performance) + '\n')
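Unlike the HPO cost above, ml_er does not bail out when recall is low: if my_recall drops below 0.8 it recomputes F1 as the harmonic mean of precision and my_recall, and that value feeds the logged performance. A worked example with made-up indicator values (illustration only):

indicators = {"precision": 0.90, "my_recall": 0.60, "F1": 0.92}
interpre_weight = 0.3
interpretability = 0.50

if indicators["my_recall"] >= 0.8:
    f1 = indicators["F1"]
else:
    f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"])
performance = interpre_weight * interpretability + (1 - interpre_weight) * f1

print(round(f1, 3))           # 0.72, the harmonic mean of 0.90 and 0.60
print(round(performance, 3))  # 0.654 = 0.3 * 0.5 + 0.7 * 0.72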

@@ -0,0 +1,12 @@
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+mapping_lid = 'idAmazon'  # name of the left-table id column in the mapping table
+mapping_rid = 'idGoogleBase'  # name of the right-table id column in the mapping table
+ltable_id = 'id'  # name of the id column in the left table
+rtable_id = 'id'  # name of the id column in the right table
+target_attr = 'id'  # target attribute for MD discovery
+lr_attrs_map = {'title': 'name'}  # if corresponding columns are named differently in the two tables, list them here so they can be renamed to match
+similarity_threshold = 0.7
+confidence_threshold = 0.8
+interpre_weight = 0.3  # interpretability weight
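Every module now reads these values via from settings import *. The lr_attrs_map entry is what lets the rest of the pipeline treat the two tables as sharing one schema: the left table's 'title' column is renamed to the right table's 'name' before blocking and feature generation, and the id column name is unified as tables_id = rtable_id. A minimal illustration with toy frames (illustration only, not the real CSVs):

import pandas as pd

lr_attrs_map = {'title': 'name'}
ltable = pd.DataFrame({'id': [1], 'title': ['canon powershot a510'], 'price': [199.0]})
rtable = pd.DataFrame({'id': ['g1'], 'name': ['canon powershot a510 camera'], 'price': [189.0]})

ltable = ltable.rename(columns=lr_attrs_map)
selected_attrs = ltable.columns.values.tolist()
print(selected_attrs)                                    # ['id', 'name', 'price']
print(selected_attrs == rtable.columns.values.tolist())  # True: both tables now share one schema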