From 34b2c72646801234d62c2ab67dc4f768cc9cbd3b Mon Sep 17 00:00:00 2001 From: HuangJintao <1447537163@qq.com> Date: Mon, 16 Oct 2023 20:39:06 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=BA=BF=E7=A8=8B=E6=B1=A0bu?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + entrance.py | 2 +- md_discovery/functions/multi_process_infer_by_pairs.py | 6 ++++-- ml_er/ml_entity_resolver.py | 2 +- settings.py | 6 +++--- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 52d424e..6c7c123 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /deprecated/ +/datasets/ diff --git a/entrance.py b/entrance.py index f441733..403c4ac 100644 --- a/entrance.py +++ b/entrance.py @@ -23,7 +23,7 @@ if __name__ == '__main__': # todo # 距离度量用户可设置? # 使用drop删除特征向量中的列?(如删除id相关特征) - run(3) # 迭代3轮 + run(1) # 迭代3轮 # ml_er(1) # todo 将优化结果与参数输出到文件中 # 通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息 diff --git a/md_discovery/functions/multi_process_infer_by_pairs.py b/md_discovery/functions/multi_process_infer_by_pairs.py index 8f6b867..4100214 100644 --- a/md_discovery/functions/multi_process_infer_by_pairs.py +++ b/md_discovery/functions/multi_process_infer_by_pairs.py @@ -116,7 +116,8 @@ def inference_from_record_pairs(path, threshold, target_col): lock = manager.Lock() if len(minimal_vio) == 0: return md_list, [] - pool = multiprocessing.Pool(len(minimal_vio)) + pool_size = len(minimal_vio) if len(minimal_vio) < 61 else 60 + pool = multiprocessing.Pool(pool_size) # tmp = copy.deepcopy(minimal_vio) with manager: proxy_minimal_vio = manager.list(minimal_vio) @@ -145,7 +146,8 @@ def get_mds_metadata(md_list, dataset_path, target_col): manager = multiprocessing.Manager() if len(md_list) == 0: return [] - pool = multiprocessing.Pool(len(md_list)) + pool_size = len(md_list) if len(md_list) < 61 else 60 + pool = multiprocessing.Pool(pool_size) result = [] with manager: for _ in md_list: diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py index f766c8c..291c175 100644 --- a/ml_er/ml_entity_resolver.py +++ b/ml_er/ml_entity_resolver.py @@ -198,7 +198,7 @@ def ml_er(iter_round: int, config: Configuration = None, ): elif config["ml_blocker"] == "attr_equiv": blocker = em.AttrEquivalenceBlocker() candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"], - l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1) + l_output_attrs=selected_attrs, r_output_attrs=selected_attrs) else: matcher = em.RFMatcher(name='RF', random_state=0) blocker = em.OverlapBlocker() diff --git a/settings.py b/settings.py index 905e567..3319be4 100644 --- a/settings.py +++ b/settings.py @@ -1,6 +1,6 @@ -ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv' -rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv' -mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv' +ltable_path = 'datasets\\Amazon.csv' +rtable_path = 'datasets\\GoogleProducts.csv' +mapping_path = 'datasets\\Amzon_GoogleProducts_perfectMapping.csv' mapping_lid = 'idAmazon' # mapping表中左表id名 mapping_rid = 'idGoogleBase' # mapping表中右表id名 ltable_id = 'id' # 左表id字段名称