From e18295838abf6d5f546ce4ce5c2c319b7130b2e1 Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Mon, 25 Sep 2023 09:10:21 +0800 Subject: [PATCH] 9.24 --- entrance.py | 37 +++-- hpo/{magellan_hpo.py => er_model_hpo.py} | 131 +++++++++-------- md_discovery/script/md_discover.py | 14 +- ...Goods Dataset.py => ml_entity_resolver.py} | 135 +++++++++++------- settings.py | 12 ++ 5 files changed, 192 insertions(+), 137 deletions(-) rename hpo/{magellan_hpo.py => er_model_hpo.py} (76%) rename ml_er/{Goods Dataset.py => ml_entity_resolver.py} (67%) create mode 100644 settings.py diff --git a/entrance.py b/entrance.py index a0ac5d6..eca4e6a 100644 --- a/entrance.py +++ b/entrance.py @@ -1,29 +1,28 @@ # this is the entrance of the auto-ER procedure from md_discovery.script.md_discover import md_discover +from ml_er.ml_entity_resolver import ml_er +from hpo.er_model_hpo import ml_er_hpo +from settings import * -ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv' -rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv' -mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv' -mapping_lid = 'idAmazon' # mapping表中左表id名 -mapping_rid = 'idGoogleBase' # mapping表中右表id名 -ltable_id = 'id' # 左表id字段名称 -rtable_id = 'id' # 右表id字段名称 -target_attr = 'id' # 进行md挖掘时的目标字段 -lr_attrs_map = {'title': 'name'} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 -similarity_threshold = 0.7 -confidence_threshold = 0.8 -interpretability_weight = 0.3 - -def run(l_table_path, r_table_path, mapping_path): - # while The termination condition is not met: - while True: - # er() +def run(rounds: int): + hp_config = None + # while The termination condition is not met + iter_round = 1 + for i in range(0, rounds): + ml_er(iter_round, hp_config) md_discover() - # hpo() + hp_config = ml_er_hpo() + iter_round += 1 + ml_er(iter_round, hp_config) return if __name__ == '__main__': - # todo 距离度量用户可设置? + path = 'md_discovery/output' + # todo + # 距离度量用户可设置? + # 使用drop删除特征向量中的列?(如删除id相关特征) + run(1) + # ml_er(1) print(ltable_path) diff --git a/hpo/magellan_hpo.py b/hpo/er_model_hpo.py similarity index 76% rename from hpo/magellan_hpo.py rename to hpo/er_model_hpo.py index ff5775f..395b499 100644 --- a/hpo/magellan_hpo.py +++ b/hpo/er_model_hpo.py @@ -1,4 +1,5 @@ -from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float +import os +from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer from ConfigSpace.conditions import InCondition import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm @@ -6,15 +7,13 @@ import pandas as pd from smac import HyperparameterOptimizationFacade, Scenario from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio -from entrance import * +from settings import * # 数据在外部加载 ######################################################################################################################## ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') -cm.set_key(ltable, ltable_id) ltable.fillna("", inplace=True) rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') -cm.set_key(rtable, rtable_id) rtable.fillna("", inplace=True) mappings = pd.read_csv(mapping_path) @@ -32,21 +31,12 @@ for index, row in mappings.iterrows(): # 仅保留两表中出现在映射表中的行,增大正样本比例 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)] selected_ltable = selected_ltable.rename(columns=lr_attrs_map) # 参照右表,修改左表中与右表对应但不同名的字段 +tables_id = rtable_id # 不论左表右表ID字段名是否一致,经上一行调整,统一以右表为准 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)] selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名 -attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs] -attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs] -cm.set_key(selected_ltable, ltable_id) -cm.set_key(selected_rtable, rtable_id) ######################################################################################################################## -def test_test(): - block_attr_items = selected_attrs[:] - block_attr_items.remove(rtable_id) - print(block_attr_items) - - def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, test_proportion: float) -> dict: new_df = df.reset_index(drop=False, inplace=False) @@ -82,6 +72,8 @@ def load_mds(paths: list) -> list: all_mds = [] # 传入md路径列表 for md_path in paths: + if not os.path.exists(md_path): + continue mds = [] # 打开每一个md文件 with open(md_path, 'r') as f: @@ -102,7 +94,7 @@ def is_explicable(row, all_mds: list) -> bool: explicable = True # 假设这条md能解释当前元组 for a in attrs: threshold = md[a] - if my_Levenshtein_ratio(str(getattr(row, 'ltable_'+a)), str(getattr(row, 'rtable_'+a))) < threshold: + if my_Levenshtein_ratio(str(getattr(row, 'ltable_' + a)), str(getattr(row, 'rtable_' + a))) < threshold: explicable = False # 任意一个字段的相似度达不到阈值,这条md就不能解释当前元组 break # 不再与当前md的其他相似度阈值比较,跳转到下一条md if explicable: @@ -116,13 +108,12 @@ class Classifier: # Build Configuration Space which defines all parameters and their ranges cs = ConfigurationSpace(seed=0) block_attr_items = selected_attrs[:] - block_attr_items.remove(rtable_id) + block_attr_items.remove(tables_id) block_attr = Categorical("block_attr", block_attr_items) overlap_size = Integer("overlap_size", (1, 3), default=1) ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf") ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap") - # todo 其他可调参数(如feature table删去某列) use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"]) cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker]) @@ -131,6 +122,11 @@ class Classifier: # train 就是整个函数 只需将返回结果由预测变成预测结果的评估 def train(self, config: Configuration, seed: int = 0) -> float: + + attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs] # 字段名加左前缀 + attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs] # 字段名加右前缀 + cm.set_key(selected_ltable, tables_id) + cm.set_key(selected_rtable, tables_id) if config["ml_blocker"] == "over_lap": blocker = em.OverlapBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], @@ -145,13 +141,13 @@ class Classifier: candidate_match_rows = [] for index, row in candidate.iterrows(): - l_id = row['ltable_' + ltable_id] + l_id = row['ltable_' + tables_id] map_row = mappings[mappings[mapping_lid] == l_id] if map_row is not None: r_id = map_row[mapping_rid] for value in r_id: - if value == row['rtable_' + rtable_id]: + if value == row['rtable_' + tables_id]: candidate_match_rows.append(row["_id"]) else: continue @@ -165,9 +161,12 @@ class Classifier: candidate_mismatch = candidate_mismatch.sample(n=len(candidate_match)) # 拼接正负样本 candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match]) + if len(candidate_for_train_test) == 0: + return 1 + cm.set_key(candidate_for_train_test, '_id') - cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id) - cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id) + cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id) + cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id) cm.set_ltable(candidate_for_train_test, selected_ltable) cm.set_rtable(candidate_for_train_test, selected_rtable) @@ -178,7 +177,18 @@ class Classifier: train_set = sets['train'] test_set = sets['test'] - matcher = None + cm.set_key(train_set, '_id') + cm.set_fk_ltable(train_set, 'ltable_' + tables_id) + cm.set_fk_rtable(train_set, 'rtable_' + tables_id) + cm.set_ltable(train_set, selected_ltable) + cm.set_rtable(train_set, selected_rtable) + + cm.set_key(test_set, '_id') + cm.set_fk_ltable(test_set, 'ltable_' + tables_id) + cm.set_fk_rtable(test_set, 'rtable_' + tables_id) + cm.set_ltable(test_set, selected_ltable) + cm.set_rtable(test_set, selected_rtable) + if config["ml_matcher"] == "dt": matcher = em.DTMatcher(name='DecisionTree', random_state=0) elif config["ml_matcher"] == "svm": @@ -198,25 +208,21 @@ class Classifier: attrs_after=['gold'], show_progress=False) - # todo 属性名解耦 + test_feature_after = attrs_with_l_prefix[:] + test_feature_after.extend(attrs_with_r_prefix) + for _ in test_feature_after: + if _.endswith(tables_id): + test_feature_after.remove(_) + test_feature_after.append('gold') test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, - attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer', - 'ltable_price', 'rtable_name', 'rtable_description', - 'rtable_manufacturer', 'rtable_price', 'gold'], show_progress=False) - - # todo 参数可调 用drop删除特征向量中的列? - # 1.exclude_attrs - # 去掉id相关的相似度 - matcher.fit(table=train_feature_vecs, - exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], - target_attr='gold') - - # 1.exclude_attrs - predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title', - 'ltable_description', 'ltable_manufacturer', - 'ltable_price', 'rtable_name', 'rtable_description', - 'rtable_manufacturer', 'rtable_price', 'gold'], - append=True, target_attr='predicted', inplace=False) + attrs_after=test_feature_after, show_progress=False) + + fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold'] + matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') + + test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id]) + predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, + append=True, target_attr='predicted', inplace=False) eval_result = em.eval_matches(predictions, 'gold', 'predicted') em.print_eval_summary(eval_result) indicators = evaluate_prediction(predictions, 'gold', 'predicted', matching_number, test_proportion) @@ -231,34 +237,42 @@ class Classifier: # 默认路径为 "../md_discovery/output/xxx.txt" # 真阳/假阴 mds/vio 共4个md文件 - md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt', - '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt'] + md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt', + 'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt'] epl_match = 0 # 可解释,预测match nepl_mismatch = 0 # 不可解释,预测mismatch md_list = load_mds(md_paths) # 从全局变量中读取所有的md - for row in predictions.itertuples(): - if is_explicable(row, md_list): - if getattr(row, 'predicted') == 1: - epl_match += 1 - else: - if getattr(row, 'predicted') == 0: - nepl_mismatch += 1 - epl_ability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 - f1 = indicators['F1'] - performance = interpretability_weight * epl_ability + (1 - interpretability_weight) * f1 + if len(md_list) > 0: + for row in predictions.itertuples(): + if is_explicable(row, md_list): + if getattr(row, 'predicted') == 1: + epl_match += 1 + else: + if getattr(row, 'predicted') == 0: + nepl_mismatch += 1 + interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + # if indicators["my_recall"] >= 0.8: + # f1 = indicators["F1"] + # else: + # f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"]) + if indicators["my_recall"] < 0.8: + return 1 + f1 = indicators["F1"] + performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 return 1 - performance -if __name__ == "__main__": +def ml_er_hpo(): classifier = Classifier() # Next, we create an object, holding general information about the run scenario = Scenario( classifier.configspace, - n_trials=12, # We want to run max 50 trials (combination of config and seed) + deterministic=True, + n_trials=10, # We want to run max 50 trials (combination of config and seed) ) - initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=3) + initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) # Now we use SMAC to find the best hyperparameters smac = HyperparameterOptimizationFacade( @@ -268,9 +282,6 @@ if __name__ == "__main__": overwrite=True, # If the run exists, we overwrite it; alternatively, we can continue from last state ) - # todo - # 如果new_recall过低则避免其成为最优解 - # 将损失函数置为1/用new_recall降低F1从而提高损失函数 incumbent = smac.optimize() # Get cost of default configuration @@ -280,6 +291,6 @@ if __name__ == "__main__": # Let's calculate the cost of the incumbent incumbent_cost = smac.validate(incumbent) print(f"Incumbent cost: {incumbent_cost}") - print(f"Configuration:{incumbent.values()}") - print(f"MAX_F1:{1-classifier.train(incumbent)}") + + return incumbent diff --git a/md_discovery/script/md_discover.py b/md_discovery/script/md_discover.py index 6a87c9b..c3b9908 100644 --- a/md_discovery/script/md_discover.py +++ b/md_discovery/script/md_discover.py @@ -1,7 +1,7 @@ import time from md_discovery.functions.multi_process_infer_by_pairs import inference_from_record_pairs from md_discovery.functions.multi_process_infer_by_pairs import get_mds_metadata -from entrance import * +from settings import * # # 若不输出support和confidence,使用以下两块代码 # # 将列表1写入本地,路径需自己修改 @@ -19,8 +19,8 @@ from entrance import * def md_discover(): # 目前可以仿照这个main函数写 - tp_single_tuple_path = "../../ml_er/output/tp_single_tuple.csv" - fn_single_tuple_path = "../../ml_er/output/fn_single_tuple.csv" + tp_single_tuple_path = "ml_er/output/tp_single_tuple.csv" + fn_single_tuple_path = "ml_er/output/fn_single_tuple.csv" # 输入:csv文件路径,md左侧相似度阈值,md右侧目标字段 # 输出:2个md列表,列表1中md无violation,列表2中md有violation但confidence满足阈值(0.8) # 例如此处输入参数要求md左侧相似度字段至少为0.7,右侧指向'id'字段 @@ -36,8 +36,8 @@ def md_discover(): # 若输出support和confidence,使用以下两块代码 # 将列表1写入本地,路径需自己修改 - tp_mds_path = "../output/tp_mds.txt" - tp_vio_path = "../output/tp_vio.txt" + tp_mds_path = "md_discovery/output/tp_mds.txt" + tp_vio_path = "md_discovery/output/tp_vio.txt" with open(tp_mds_path, 'w') as f: for _ in tp_mds_meta: @@ -51,8 +51,8 @@ def md_discover(): f.write(i + ':' + str(_[i]) + '\t') f.write('\n') - fn_mds_path = "../output/fn_mds.txt" - fn_vio_path = "../output/fn_vio.txt" + fn_mds_path = "md_discovery/output/fn_mds.txt" + fn_vio_path = "md_discovery/output/fn_vio.txt" with open(fn_mds_path, 'w') as f: for _ in fn_mds_meta: diff --git a/ml_er/Goods Dataset.py b/ml_er/ml_entity_resolver.py similarity index 67% rename from ml_er/Goods Dataset.py rename to ml_er/ml_entity_resolver.py index d864bad..1063eca 100644 --- a/ml_er/Goods Dataset.py +++ b/ml_er/ml_entity_resolver.py @@ -1,3 +1,4 @@ +import os import sys from py_entitymatching.debugmatcher.debug_gui_utils import _get_metric @@ -11,19 +12,18 @@ import time import six from ConfigSpace import Configuration from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio -from entrance import * -from hpo.magellan_hpo import incumbent +from settings import * -def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "output/tp_single_tuple.csv", fn_single_tuple_path: str = "output/fn_single_tuple.csv"): +def process_prediction_for_md_discovery(pred: pd.DataFrame, tp_single_tuple_path: str = "ml_er/output/tp_single_tuple.csv", fn_single_tuple_path: str = "ml_er/output/fn_single_tuple.csv"): # 提取预测表中真阳和假阴部分 tp = pred[(pred['gold'] == 1) & (pred['predicted'] == 1)] fn = pred[(pred['gold'] == 1) & (pred['predicted'] == 0)] # 将真阳/假阴表中左右ID调整一致 for index, row in tp.iterrows(): - tp.loc[index, "rtable_id"] = row["ltable_id"] + tp.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] for index, row in fn.iterrows(): - fn.loc[index, "rtable_id"] = row["ltable_id"] + fn.loc[index, "rtable_" + rtable_id] = row["ltable_" + rtable_id] pred_columns = pred.columns.values.tolist() l_columns = [] @@ -93,6 +93,8 @@ def load_mds(paths: list) -> list: all_mds = [] # 传入md路径列表 for md_path in paths: + if not os.path.exists(md_path): + continue mds = [] # 打开每一个md文件 with open(md_path, 'r') as f: @@ -137,14 +139,10 @@ def load_data(left_path: str, right_path: str, mapping_path: str): return left, right, mapping -def ml_er(config: Configuration): +def ml_er(iter_round: int, config: Configuration = None, ): # todo: # if config is not None -> load configs - # else use default configs - # 1. block_attr - # 2. overlap_size - # 3. ml_matcher - # 4. ml_blocker + # else -> use default configs ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') cm.set_key(ltable, ltable_id) ltable.fillna("", inplace=True) @@ -168,28 +166,58 @@ def ml_er(config: Configuration): selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)] selected_ltable = selected_ltable.rename(columns=lr_attrs_map) # 参照右表,修改左表中与右表对应但不同名的字段 + tables_id = rtable_id selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)] selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名 + items_but_id = selected_attrs[:] + items_but_id.remove(tables_id) # 两张表中除了id的字段名 attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs] attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs] - cm.set_key(selected_ltable, ltable_id) - cm.set_key(selected_rtable, rtable_id) + cm.set_key(selected_ltable, tables_id) + cm.set_key(selected_rtable, tables_id) + + if config is not None: + ml_matcher = config["ml_matcher"] + if ml_matcher == "dt": + matcher = em.DTMatcher(name='DecisionTree', random_state=0) + elif ml_matcher == "svm": + matcher = em.SVMMatcher(name='SVM', random_state=0) + elif ml_matcher == "rf": + matcher = em.RFMatcher(name='RF', random_state=0) + elif ml_matcher == "lg": + matcher = em.LogRegMatcher(name='LogReg', random_state=0) + elif ml_matcher == "ln": + matcher = em.LinRegMatcher(name='LinReg') + elif ml_matcher == "nb": + matcher = em.NBMatcher(name='NaiveBayes') + + if config["ml_blocker"] == "over_lap": + blocker = em.OverlapBlocker() + candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], + l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, + overlap_size=config["overlap_size"], show_progress=False) + elif config["ml_blocker"] == "attr_equiv": + blocker = em.AttrEquivalenceBlocker() + candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], + l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1) + else: + matcher = em.RFMatcher(name='RF', random_state=0) + blocker = em.OverlapBlocker() + candidate = blocker.block_tables(selected_ltable, selected_rtable, items_but_id[0], items_but_id[0], + l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, + overlap_size=1, show_progress=False) - blocker = em.OverlapBlocker() - candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name', - l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, - overlap_size=1, show_progress=False) candidate['gold'] = 0 candidate_match_rows = [] for index, row in candidate.iterrows(): - l_id = row['ltable_' + ltable_id] + l_id = row['ltable_' + tables_id] map_row = mappings[mappings[mapping_lid] == l_id] if map_row is not None: r_id = map_row[mapping_rid] for value in r_id: - if value == row['rtable_' + rtable_id]: + if value == row['rtable_' + tables_id]: candidate_match_rows.append(row["_id"]) else: continue @@ -204,19 +232,18 @@ def ml_er(config: Configuration): # 拼接正负样本 candidate_for_train_test = pd.concat([candidate_mismatch, candidate_match]) cm.set_key(candidate_for_train_test, '_id') - cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + ltable_id) - cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + rtable_id) + cm.set_fk_ltable(candidate_for_train_test, 'ltable_' + tables_id) + cm.set_fk_rtable(candidate_for_train_test, 'rtable_' + tables_id) cm.set_ltable(candidate_for_train_test, selected_ltable) cm.set_rtable(candidate_for_train_test, selected_rtable) # 分为训练测试集 train_proportion = 0.7 test_proportion = 0.3 - sets = em.split_train_test(candidate_for_train_test, train_proportion=0.7, random_state=0) + sets = em.split_train_test(candidate_for_train_test, train_proportion=train_proportion, random_state=0) train_set = sets['train'] test_set = sets['test'] - rf = em.RFMatcher(name='RF', random_state=0) feature_table = em.get_features_for_matching(selected_ltable, selected_rtable, validate_inferred_attr_types=False) train_feature_vecs = em.extract_feature_vecs(train_set, @@ -224,20 +251,19 @@ def ml_er(config: Configuration): attrs_after=['gold'], show_progress=False) + test_feature_after = attrs_with_l_prefix[:] + test_feature_after.extend(attrs_with_r_prefix) + for _ in test_feature_after: + if _.endswith(tables_id): + test_feature_after.remove(_) + test_feature_after.append('gold') test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, - attrs_after=['ltable_name', 'ltable_description', 'ltable_manufacturer', - 'ltable_price', 'rtable_name', 'rtable_description', - 'rtable_manufacturer', 'rtable_price', 'gold'], - show_progress=False) - - rf.fit(table=train_feature_vecs, - exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], - target_attr='gold') - predictions = rf.predict(table=test_feature_vecs, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_name', - 'ltable_description', 'ltable_manufacturer', - 'ltable_price', 'rtable_name', - 'rtable_description', - 'rtable_manufacturer', 'rtable_price', 'gold'], + attrs_after=test_feature_after, show_progress=False) + + fit_exclude = ['_id', 'ltable_' + tables_id, 'rtable_' + tables_id, 'gold'] + matcher.fit(table=train_feature_vecs, exclude_attrs=fit_exclude, target_attr='gold') + test_feature_after.extend(['_id', 'ltable_' + tables_id, 'rtable_' + tables_id]) + predictions = matcher.predict(table=test_feature_vecs, exclude_attrs=test_feature_after, append=True, target_attr='predicted', inplace=False) eval_result = em.eval_matches(predictions, 'gold', 'predicted') em.print_eval_summary(eval_result) @@ -252,28 +278,35 @@ def ml_er(config: Configuration): predictions_attrs.extend(['gold', 'predicted']) predictions = predictions[predictions_attrs] + md_paths = ['md_discovery/output/tp_mds.txt', 'md_discovery/output/tp_vio.txt', + 'md_discovery/output/fn_mds.txt', 'md_discovery/output/fn_vio.txt'] epl_match = 0 # 可解释,预测match nepl_mismatch = 0 # 不可解释,预测mismatch - p_md = "/home/w/A-New Folder/8.14/Goods Dataset/TP_md_list.txt" - p_vio = "/home/w/A-New Folder/8.14/Goods Dataset/TP_vio_list.txt" - md_paths: list = [p_md, p_vio] - md_list = load_mds(md_paths) # 从全局变量中读取所有的md - for row in predictions.itertuples(): - if is_explicable(row, md_list): - if getattr(row, 'predicted') == 1: - epl_match += 1 - else: - if getattr(row, 'predicted') == 0: - nepl_mismatch += 1 - epl_ability = (epl_match + nepl_mismatch) / len(predictions) + md_list = load_mds(md_paths) # 从全局变量中读取所有的md + if len(md_list) > 0: + for row in predictions.itertuples(): + if is_explicable(row, md_list): + if getattr(row, 'predicted') == 1: + epl_match += 1 + else: + if getattr(row, 'predicted') == 0: + nepl_mismatch += 1 + + interpretability = (epl_match + nepl_mismatch) / len(predictions) # 可解释性 + if indicators["my_recall"] >= 0.8: + f1 = indicators["F1"] + else: + f1 = (2.0 * indicators["precision"] * indicators["my_recall"]) / (indicators["precision"] + indicators["my_recall"]) + performance = interpre_weight * interpretability + (1 - interpre_weight) * f1 ################################################################################################################ process_prediction_for_md_discovery(predictions) - output_path = "output/eval_result" + str(time.time()) + ".txt" + output_path = "ml_er/output/eval_result_" + str(iter_round) + ".txt" with open(output_path, 'w') as f: for key, value in six.iteritems(_get_metric(eval_result)): f.write(key + " : " + value) f.write('\n') - f.write('my_recall:' + str(indicators["my_recall"])) - f.write('\n') + f.write('my_recall:' + str(indicators["my_recall"]) + '\n') + f.write('interpretability:' + str(interpretability) + '\n') + f.write('performance:' + str(performance) + '\n') diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..905e567 --- /dev/null +++ b/settings.py @@ -0,0 +1,12 @@ +ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv' +rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv' +mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv' +mapping_lid = 'idAmazon' # mapping表中左表id名 +mapping_rid = 'idGoogleBase' # mapping表中右表id名 +ltable_id = 'id' # 左表id字段名称 +rtable_id = 'id' # 右表id字段名称 +target_attr = 'id' # 进行md挖掘时的目标字段 +lr_attrs_map = {'title': 'name'} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 +similarity_threshold = 0.7 +confidence_threshold = 0.8 +interpre_weight = 0.3 # 可解释性权重