9.20

2 years ago · c349768eaf
parent 27795aac1d
commit c349768eaf
3 changed files with 68 additions and 66 deletions
--- a/entrance.py
+++ b/entrance.py
@@ -2,10 +2,18 @@
 from md_discovery.script.md_discover import md_discover


-# todo: magellan ER模块读入初始化配置或hpo配置
-# todo: 模块间的自动化调用
-#  入口到ER/HPO到ER
-
+ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
+rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
+mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+mapping_lid = 'idAmazon'  # mapping表中左表id名
+mapping_rid = 'idGoogleBase'  # mapping表中右表id名
+ltable_id = 'id'  # 左表id字段名称
+rtable_id = 'id'  # 右表id字段名称
+target_attr = 'id'  # 进行md挖掘时的目标字段
+lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
+similarity_threshold = 0.7
+confidence_threshold = 0.8
+interpretability_weight = 0.3

 def run(l_table_path, r_table_path, mapping_path):
    # while The termination condition is not met:
@@ -17,18 +25,5 @@ def run(l_table_path, r_table_path, mapping_path):


 if __name__ == '__main__':
-    # todo：使用input函数输入变量值（不方便就不用input）
-    #  7. 距离度量方式    ？
-    ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-    rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-    mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
-    mapping_lid = 'idAmazon'  # mapping表中左表id名
-    mapping_rid = 'idGoogleBase'  # mapping表中右表id名
-    ltable_id = 'id'  # 左表id字段名称
-    rtable_id = 'id'  # 右表id字段名称
-    target_attr = 'id'  # 进行md挖掘时的目标字段
-    lr_attrs_map = {'title': 'name'}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-    similarity_threshold = 0.7
-    confidence_threshold = 0.8
-    interpretability_weight = 0.3
+    # todo 距离度量用户可设置?
    print(ltable_path)
--- a/hpo/magellan_hpo.py
+++ b/hpo/magellan_hpo.py
@@ -1,20 +1,50 @@
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+from ConfigSpace.conditions import InCondition
 import py_entitymatching as em
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
-from py_entitymatching.blocker.blocker import Blocker
-from py_entitymatching.matcher.mlmatcher import MLMatcher

 from smac import HyperparameterOptimizationFacade, Scenario
 from md_discovery.functions.multi_process_infer_by_pairs import my_Levenshtein_ratio
 from entrance import *
-# todo 距离度量用户可设置

-# 全局变量，每次迭代后清空列表，加入新的md路径
-# todo:
-#  默认路径为 "../md_discovery/output/xxx.txt"
-#  真阳/假阴  mds/vio  共4个md文件
+# 数据在外部加载
+########################################################################################################################
+ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
+cm.set_key(ltable, ltable_id)
+ltable.fillna("", inplace=True)
+rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
+cm.set_key(rtable, rtable_id)
+rtable.fillna("", inplace=True)
+mappings = pd.read_csv(mapping_path)
+
+lid_mapping_list = []
+rid_mapping_list = []
+# 全部转为字符串
+ltable = ltable.astype(str)
+rtable = rtable.astype(str)
+mappings = mappings.astype(str)
+matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
+
+for index, row in mappings.iterrows():
+    lid_mapping_list.append(row[mapping_lid])
+    rid_mapping_list.append(row[mapping_rid])
+# 仅保留两表中出现在映射表中的行，增大正样本比例
+selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
+selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
+selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
+selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
+attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
+attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
+cm.set_key(selected_ltable, ltable_id)
+cm.set_key(selected_rtable, rtable_id)
+########################################################################################################################
+

+def test_test():
+    block_attr_items = selected_attrs[:]
+    block_attr_items.remove(rtable_id)
+    print(block_attr_items)


 def evaluate_prediction(df: pd.DataFrame, labeled_attr: str, predicted_attr: str, matching_number: int,
@@ -80,54 +110,27 @@ def is_explicable(row, all_mds: list) -> bool:
    return False  # 遍历结束，不能解释


-class SVM:
+class Classifier:
    @property
    def configspace(self) -> ConfigurationSpace:
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace(seed=0)
-        # todo
-        #  block_attr 取消打桩
-        block_attr = Categorical("block_attr", ["name", "description", "manufacturer", "price"], default="title")
+        block_attr_items = selected_attrs[:]
+        block_attr_items.remove(rtable_id)
+
+        block_attr = Categorical("block_attr", block_attr_items)
        overlap_size = Integer("overlap_size", (1, 3), default=1)
        ml_matcher = Categorical("ml_matcher", ["dt", "svm", "rf", "lg", "ln", "nb"], default="rf")
        ml_blocker = Categorical("ml_blocker", ["over_lap", "attr_equiv"], default="over_lap")
+        # todo 其他可调参数(如feature table删去某列)

+        use_overlap_size = InCondition(child=overlap_size, parent=ml_blocker, values=["over_lap"])
        cs.add_hyperparameters([block_attr, overlap_size, ml_matcher, ml_blocker])
+        cs.add_conditions([use_overlap_size])
        return cs

    # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
-    def train(self, config: Configuration) -> float:
-        ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
-        cm.set_key(ltable, ltable_id)
-        ltable.fillna("", inplace=True)
-        rtable = pd.read_csv(rtable_path, encoding='ISO-8859-1')
-        cm.set_key(rtable, rtable_id)
-        rtable.fillna("", inplace=True)
-        mappings = pd.read_csv(mapping_path)
-
-        # 仅保留两表中出现在映射表中的行，增大正样本比例
-        lid_mapping_list = []
-        rid_mapping_list = []
-        # 全部转为字符串
-        ltable = ltable.astype(str)
-        rtable = rtable.astype(str)
-        mappings = mappings.astype(str)
-        matching_number = len(mappings)  # 所有阳性样本数，商品数据集应为1300
-
-        for index, row in mappings.iterrows():
-            lid_mapping_list.append(row[mapping_lid])
-            rid_mapping_list.append(row[mapping_rid])
-
-        selected_ltable = ltable[ltable[ltable_id].isin(lid_mapping_list)]
-        selected_ltable = selected_ltable.rename(columns=lr_attrs_map)  # 参照右表，修改左表中与右表对应但不同名的字段
-        selected_rtable = rtable[rtable[rtable_id].isin(rid_mapping_list)]
-        selected_attrs = selected_ltable.columns.values.tolist()  # 两张表中的字段名
-        attrs_with_l_prefix = ['ltable_'+i for i in selected_attrs]
-        attrs_with_r_prefix = ['rtable_'+i for i in selected_attrs]
-        cm.set_key(selected_ltable, ltable_id)
-        cm.set_key(selected_rtable, rtable_id)
-
-        blocker = None
+    def train(self, config: Configuration, seed: int = 0) -> float:
        if config["ml_blocker"] == "over_lap":
            blocker = em.OverlapBlocker()
            candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
@@ -195,6 +198,7 @@ class SVM:
                                                     attrs_after=['gold'],
                                                     show_progress=False)

+        # todo 属性名解耦
        test_feature_vecs = em.extract_feature_vecs(test_set, feature_table=feature_table,
                                                    attrs_after=['ltable_title', 'ltable_description', 'ltable_manufacturer',
                                                                 'ltable_price', 'rtable_name', 'rtable_description',
@@ -225,6 +229,8 @@ class SVM:
        predictions_attrs.extend(['gold', 'predicted'])
        predictions = predictions[predictions_attrs]

+        #  默认路径为 "../md_discovery/output/xxx.txt"
+        #  真阳/假阴  mds/vio  共4个md文件
        md_paths = ['../md_discovery/output/tp_mds.txt', '../md_discovery/output/tp_vio.txt',
                    '../md_discovery/output/fn_mds.txt', '../md_discovery/output/fn_vio.txt']
        epl_match = 0  # 可解释，预测match
@@ -244,7 +250,7 @@ class SVM:


 if __name__ == "__main__":
-    classifier = SVM()
+    classifier = Classifier()

    # Next, we create an object, holding general information about the run
    scenario = Scenario(
--- a/ml_er/Goods
+++ b/ml_er/Goods
@@ -141,7 +141,10 @@ def ml_er(config: Configuration):
    # todo:
    #  if config is not None -> load configs
    #  else use default configs
-    #  1. overlap_attr
+    #  1. block_attr
+    #  2. overlap_size
+    #  3. ml_matcher
+    #  4. ml_blocker
    ltable = pd.read_csv(ltable_path, encoding='ISO-8859-1')
    cm.set_key(ltable, ltable_id)
    ltable.fillna("", inplace=True)
@@ -172,8 +175,6 @@ def ml_er(config: Configuration):
    cm.set_key(selected_ltable, ltable_id)
    cm.set_key(selected_rtable, rtable_id)

-    # todo 所有可调参数需读入配置并有默认值
-    # block 并将gold标记为0
    blocker = em.OverlapBlocker()
    candidate = blocker.block_tables(selected_ltable, selected_rtable, 'name', 'name',
                                     l_output_attrs=selected_attrs, r_output_attrs=selected_attrs,