From c349768eaf355682e31d1fd32b97e4e1c340041e Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Wed, 20 Sep 2023 22:42:30 +0800 Subject: [PATCH] 9.20 --- entrance.py | 31 ++++++-------- hpo/magellan_hpo.py | 96 ++++++++++++++++++++++-------------------- ml_er/Goods Dataset.py | 7 +-- 3 files changed, 68 insertions(+), 66 deletions(-) diff --git a/entrance.py b/entrance.py index 6ec509b..a0ac5d6 100644 --- a/entrance.py +++ b/entrance.py @@ -2,10 +2,18 @@ from md_discovery.script.md_discover import md_discover -# todo: magellan ER模块读入初始化配置或hpo配置 -# todo: 模块间的自动化调用 -# 入口到ER/HPO到ER - +ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv' +rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv' +mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv' +mapping_lid = 'idAmazon' # mapping表中左表id名 +mapping_rid = 'idGoogleBase' # mapping表中右表id名 +ltable_id = 'id' # 左表id字段名称 +rtable_id = 'id' # 右表id字段名称 +target_attr = 'id' # 进行md挖掘时的目标字段 +lr_attrs_map = {'title': 'name'} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 +similarity_threshold = 0.7 +confidence_threshold = 0.8 +interpretability_weight = 0.3 def run(l_table_path, r_table_path, mapping_path): # while The termination condition is not met: @@ -17,18 +25,5 @@ def run(l_table_path, r_table_path, mapping_path): if __name__ == '__main__': - # todo:使用input函数输入变量值(不方便就不用input) - # 7. 距离度量方式 ? - ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv' - rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv' - mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv' - mapping_lid = 'idAmazon' # mapping表中左表id名 - mapping_rid = 'idGoogleBase' # mapping表中右表id名 - ltable_id = 'id' # 左表id字段名称 - rtable_id = 'id' # 右表id字段名称 - target_attr = 'id' # 进行md挖掘时的目标字段 - lr_attrs_map = {'title': 'name'} # 如果两个表中存在对应字段名称不一样的情况,将名称加入列表便于调整一致 - similarity_threshold = 0.7 - confidence_threshold = 0.8 - interpretability_weight = 0.3 + # todo 距离度量用户可设置? print(ltable_path) diff --git a/hpo/magellan_hpo.py b/hpo/magellan_hpo.py index 8376e99..ff5775f 100644 --- a/hpo/magellan_hpo.py +++ b/hpo/magellan_hpo.py @@ -1,20 +1,50 @@ from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float +from ConfigSpace.conditions import InCondition import py_entitymatching as em import py_entitymatching.catalog.catalog_manager as cm import pandas as pd -from py_entitymatching.blocker.blocker import Blocker -from py_entitymatching.matcher.mlmatcher import MLMatcher from smac import HyperparameterOptimizationFacade, Scenario from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio from entrance import * -# todo 距离度量用户可设置 - -# 全局变量,每次迭代后清空列表,加入新的md路径 -# todo: -# 默认路径为 "../md_discovery/output/xxx.txt" -# 真阳/假阴 mds/vio 共4个md文件 +# 数据在外部加载 +######################################################################################################################## +ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') +cm.set_key(ltable, ltable_id) +ltable.fillna("", inplace=True) +rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') +cm.set_key(rtable, rtable_id) +rtable.fillna("", inplace=True) +mappings = pd.read_csv(mapping_path) + +lid_mapping_list = [] +rid_mapping_list = [] +# 全部转为字符串 +ltable = ltable.astype(str) +rtable = rtable.astype(str) +mappings = mappings.astype(str) +matching_number = len(mappings) # 所有阳性样本数,商品数据集应为1300 + +for index, row in mappings.iterrows(): + lid_mapping_list.append(row[mapping_lid]) + rid_mapping_list.append(row[mapping_rid]) +# 仅保留两表中出现在映射表中的行,增大正样本比例 +selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)] +selected_ltable = selected_ltable.rename(columns=lr_attrs_map) # 参照右表,修改左表中与右表对应但不同名的字段 +selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)] +selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名 +attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs] +attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs] +cm.set_key(selected_ltable, ltable_id) +cm.set_key(selected_rtable, rtable_id) +######################################################################################################################## + + +def test_test(): + block_attr_items = selected_attrs[:] + block_attr_items.remove(rtable_id) + print(block_attr_items) def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int, @@ -80,54 +110,27 @@ def is_explicable(row, all_mds: list) -> bool: return False # 遍历结束,不能解释 -class SVM: +class Classifier: @property def configspace(self) -> ConfigurationSpace: # Build Configuration Space which defines all parameters and their ranges cs = ConfigurationSpace(seed=0) - # todo - # block_attr 取消打桩 - block_attr = Categorical("block_attr", ["name", "description", "manufacturer", "price"], default="title") + block_attr_items = selected_attrs[:] + block_attr_items.remove(rtable_id) + + block_attr = Categorical("block_attr", block_attr_items) overlap_size = Integer("overlap_size", (1, 3), default=1) ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf") ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap") + # todo 其他可调参数(如feature table删去某列) + use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"]) cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker]) + cs.add_conditions([use_overlap_size]) return cs # train 就是整个函数 只需将返回结果由预测变成预测结果的评估 - def train(self, config: Configuration) -> float: - ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') - cm.set_key(ltable, ltable_id) - ltable.fillna("", inplace=True) - rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1') - cm.set_key(rtable, rtable_id) - rtable.fillna("", inplace=True) - mappings = pd.read_csv(mapping_path) - - # 仅保留两表中出现在映射表中的行,增大正样本比例 - lid_mapping_list = [] - rid_mapping_list = [] - # 全部转为字符串 - ltable = ltable.astype(str) - rtable = rtable.astype(str) - mappings = mappings.astype(str) - matching_number = len(mappings) # 所有阳性样本数,商品数据集应为1300 - - for index, row in mappings.iterrows(): - lid_mapping_list.append(row[mapping_lid]) - rid_mapping_list.append(row[mapping_rid]) - - selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)] - selected_ltable = selected_ltable.rename(columns=lr_attrs_map) # 参照右表,修改左表中与右表对应但不同名的字段 - selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)] - selected_attrs = selected_ltable.columns.values.tolist() # 两张表中的字段名 - attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs] - attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs] - cm.set_key(selected_ltable, ltable_id) - cm.set_key(selected_rtable, rtable_id) - - blocker = None + def train(self, config: Configuration, seed: int = 0) -> float: if config["ml_blocker"] == "over_lap": blocker = em.OverlapBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], @@ -195,6 +198,7 @@ class SVM: attrs_after=['gold'], show_progress=False) + # todo 属性名解耦 test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table, attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer', 'ltable_price', 'rtable_name', 'rtable_description', @@ -225,6 +229,8 @@ class SVM: predictions_attrs.extend(['gold', 'predicted']) predictions = predictions[predictions_attrs] + # 默认路径为 "../md_discovery/output/xxx.txt" + # 真阳/假阴 mds/vio 共4个md文件 md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt', '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt'] epl_match = 0 # 可解释,预测match @@ -244,7 +250,7 @@ class SVM: if __name__ == "__main__": - classifier = SVM() + classifier = Classifier() # Next, we create an object, holding general information about the run scenario = Scenario( diff --git a/ml_er/Goods Dataset.py b/ml_er/Goods Dataset.py index c77dd03..d864bad 100644 --- a/ml_er/Goods Dataset.py +++ b/ml_er/Goods Dataset.py @@ -141,7 +141,10 @@ def ml_er(config: Configuration): # todo: # if config is not None -> load configs # else use default configs - # 1. overlap_attr + # 1. block_attr + # 2. overlap_size + # 3. ml_matcher + # 4. ml_blocker ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1') cm.set_key(ltable, ltable_id) ltable.fillna("", inplace=True) @@ -172,8 +175,6 @@ def ml_er(config: Configuration): cm.set_key(selected_ltable, ltable_id) cm.set_key(selected_rtable, rtable_id) - # todo 所有可调参数需读入配置并有默认值 - # block 并将gold标记为0 blocker = em.OverlapBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name', l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,