9.20

2 years ago · c349768eaf
parent 27795aac1d
commit c349768eaf
3 changed files with 68 additions and 66 deletions
--- a/entrance.py
+++ b/entrance.py
@ -2,10 +2,18 @@
 from md_discovery.script.md_discover import md_discover
-# todo: magellan ER模块读入初始化配置或hpo配置
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-# todo: 模块间的自动化调用
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-#  入口到ER/HPO到ER
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-
+mapping_lid = 'idAmazon'  # mapping表中左表id名
 mapping_rid = 'idGoogleBase'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
 lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
 similarity_threshold = 0.7
 confidence_threshold = 0.8
 interpretability_weight = 0.3
 def run(l_table_path, r_table_path, mapping_path):
    # while The termination condition is not met:
@ -17,18 +25,5 @@ def run(l_table_path, r_table_path, mapping_path):
 if __name__ == '__main__':
-    # todo：使用input函数输入变量值（不方便就不用input）
+    # todo 距离度量用户可设置?
    #  7. 距离度量方式    ？
    ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
    rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
    mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
    mapping_lid = 'idAmazon'  # mapping表中左表id名
    mapping_rid = 'idGoogleBase'  # mapping表中右表id名
    ltable_id = 'id'  # 左表id字段名称
    rtable_id = 'id'  # 右表id字段名称
    target_attr = 'id'  # 进行md挖掘时的目标字段
    lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
    similarity_threshold = 0.7
    confidence_threshold = 0.8
    interpretability_weight = 0.3
    print(ltable_path)
--- a/hpo/magellan_hpo.py
+++ b/hpo/magellan_hpo.py
@ -1,20 +1,50 @@
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
 from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
 from py_entitymatching.blocker.blocker import Blocker
 from py_entitymatching.matcher.mlmatcher import MLMatcher
 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
 from entrance import *
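 # TODO: make the distance metric user-configurable
 # Global variable: after each iteration, clear the list and add the new MD paths
 # TODO:
 #  Default path is "../md_discovery/output/xxx.txt"
 #  true-positive / false-negative  x  mds / vio  -> 4 MD files in total
 # Data is loaded externally (outside the classifier)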
 ########################################################################################################################
 # Module-level data preparation: load the left/right tables and the mapping
 # table once at import time, then derive the filtered tables and attribute
 # name lists used further down in this module.
 # NOTE(review): ltable_path, rtable_path, mapping_path, ltable_id, rtable_id,
 # mapping_lid, mapping_rid and lr_attrs_map are not defined in this file;
 # presumably they arrive via `from entrance import *` - confirm.
 ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
 cm.set_key(ltable, ltable_id)
 ltable.fillna("", inplace=True)  # replace missing values with empty strings
 rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
 cm.set_key(rtable, rtable_id)
 rtable.fillna("", inplace=True)  # replace missing values with empty strings
 mappings = pd.read_csv(mapping_path)
 lid_mapping_list = []
 rid_mapping_list = []
 # Convert everything to strings
 # NOTE(review): astype() returns new DataFrame objects, so the set_key calls
 # above were made on the previous objects - confirm whether that matters
 # (keys are set again on the selected tables below).
 ltable = ltable.astype(str)
 rtable = rtable.astype(str)
 mappings = mappings.astype(str)
 matching_number = len(mappings)  # total number of positive samples; should be 1300 for the products dataset
 # Collect the left/right ids listed in the mapping table
 for index, row in mappings.iterrows():
    lid_mapping_list.append(row[mapping_lid])
    rid_mapping_list.append(row[mapping_rid])
 # Keep only the rows of both tables that appear in the mapping table, to increase the positive-sample ratio
 selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
 selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # following the right table, rename left-table fields that correspond to right-table fields but are named differently
 selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
 selected_attrs = selected_ltable.columns.values.tolist()  # field names of the two tables (taken from the renamed left table)
 # Same attribute names with the 'ltable_' / 'rtable_' prefixes
 attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
 attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
 cm.set_key(selected_ltable, ltable_id)
 cm.set_key(selected_rtable, rtable_id)
 ########################################################################################################################
 def test_test():
    # Debug helper: print the module-level attribute names with the right-table
    # id field removed. Works on a copy, so selected_attrs itself is unchanged.
    # list.remove raises ValueError if rtable_id is not among the attributes.
    block_attr_items = selected_attrs[:]
    block_attr_items.remove(rtable_id)
    print(block_attr_items)
 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@ -80,54 +110,27 @@ def is_explicable(row, all_mds: list) -> bool:
    return False  # 遍历结束，不能解释
-class SVM:
+class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace(seed=0)
-        # todo
+        block_attr_items = selected_attrs[:]
-        #  block_attr 取消打桩
+        block_attr_items.remove(rtable_id)
-        block_attr = Categorical("block_attr", ["name", "description", "manufacturer", "price"], default="title")
+
        block_attr = Categorical("block_attr", block_attr_items)
        overlap_size = Integer("overlap_size", (1, 3), default=1)
        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
        ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
        # todo 其他可调参数(如feature table删去某列)
        use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
        cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
        cs.add_conditions([use_overlap_size])
        return cs
    # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
-    def train(self, config: Configuration) -> float:
+    def train(self, config: Configuration, seed: int = 0) -> float:
        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
        cm.set_key(ltable, ltable_id)
        ltable.fillna("", inplace=True)
        rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
        cm.set_key(rtable, rtable_id)
        rtable.fillna("", inplace=True)
        mappings = pd.read_csv(mapping_path)
        # 仅保留两表中出现在映射表中的行，增大正样本比例
        lid_mapping_list = []
        rid_mapping_list = []
        # 全部转为字符串
        ltable = ltable.astype(str)
        rtable = rtable.astype(str)
        mappings = mappings.astype(str)
        matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
        for index, row in mappings.iterrows():
            lid_mapping_list.append(row[mapping_lid])
            rid_mapping_list.append(row[mapping_rid])
        selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
        selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
        selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
        selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
        attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
        attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
        cm.set_key(selected_ltable, ltable_id)
        cm.set_key(selected_rtable, rtable_id)
        blocker = None
        if config["ml_blocker"] == "over_lap":
            blocker = em.OverlapBlocker()
            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@ -195,6 +198,7 @@ class SVM:
                                                     attrs_after=['gold'],
                                                     show_progress=False)
        # todo 属性名解耦
        test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
                                                                 'ltable_price', 'rtable_name', 'rtable_description',
@ -225,6 +229,8 @@ class SVM:
        predictions_attrs.extend(['gold', 'predicted'])
        predictions = predictions[predictions_attrs]
        #  默认路径为 "../md_discovery/output/xxx.txt"
        #  真阳/假阴  mds/vio  共4个md文件
        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
        epl_match = 0  # 可解释，预测match
@ -244,7 +250,7 @@ class SVM:
 if __name__ == "__main__":
-    classifier = SVM()
+    classifier = Classifier()
    # Next, we create an object, holding general information about the run
    scenario = Scenario(
--- a/ml_er/Goods
+++ b/ml_er/Goods
@ -141,7 +141,10 @@ def ml_er(config: Configuration):
    # todo:
    #  if config is not None -> load configs
    #  else use default configs
-    #  1. overlap_attr
+    #  1. block_attr
    #  2. overlap_size
    #  3. ml_matcher
    #  4. ml_blocker
    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
    cm.set_key(ltable, ltable_id)
    ltable.fillna("", inplace=True)
@ -172,8 +175,6 @@ def ml_er(config: Configuration):
    cm.set_key(selected_ltable, ltable_id)
    cm.set_key(selected_rtable, rtable_id)
    # todo 所有可调参数需读入配置并有默认值
    # block 并将gold标记为0
    blocker = em.OverlapBlocker()
    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,