From 34b2c72646801234d62c2ab67dc4f768cc9cbd3b Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Mon, 16 Oct 2023 20:39:06 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=BA=BF=E7=A8=8B=E6=B1=A0bu?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                             | 1 +
 entrance.py                                            | 2 +-
 md_discovery/functions/multi_process_infer_by_pairs.py | 6 ++++--
 ml_er/ml_entity_resolver.py                            | 2 +-
 settings.py                                            | 6 +++---
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 52d424e..6c7c123 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /deprecated/
+/datasets/
diff --git a/entrance.py b/entrance.py
index f441733..403c4ac 100644
--- a/entrance.py
+++ b/entrance.py
@@ -23,7 +23,7 @@ if __name__ == '__main__':
     # todo
     #  距离度量用户可设置?
     #  使用drop删除特征向量中的列？(如删除id相关特征)
-    run(3)  # 迭代3轮
+    run(1)  # run 1 iteration round
     # ml_er(1)
     # todo 将优化结果与参数输出到文件中
     #  通过ml_entity_resolver.ml_er()输出,同时输出参数配置信息
diff --git a/md_discovery/functions/multi_process_infer_by_pairs.py b/md_discovery/functions/multi_process_infer_by_pairs.py
index 8f6b867..4100214 100644
--- a/md_discovery/functions/multi_process_infer_by_pairs.py
+++ b/md_discovery/functions/multi_process_infer_by_pairs.py
@@ -116,7 +116,8 @@ def inference_from_record_pairs(path, threshold, target_col):
     lock = manager.Lock()
     if len(minimal_vio) == 0:
         return md_list, []
-    pool = multiprocessing.Pool(len(minimal_vio))
+    pool_size = len(minimal_vio) if len(minimal_vio) < 61 else 60
+    pool = multiprocessing.Pool(pool_size)
     # tmp = copy.deepcopy(minimal_vio)
     with manager:
         proxy_minimal_vio = manager.list(minimal_vio)
@@ -145,7 +146,8 @@ def get_mds_metadata(md_list, dataset_path, target_col):
     manager = multiprocessing.Manager()
     if len(md_list) == 0:
         return []
-    pool = multiprocessing.Pool(len(md_list))
+    pool_size = len(md_list) if len(md_list) < 61 else 60
+    pool = multiprocessing.Pool(pool_size)
     result = []
     with manager:
         for _ in md_list:
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
index f766c8c..291c175 100644
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@@ -198,7 +198,7 @@ def ml_er(iter_round: int, config: Configuration = None, ):
         elif config["ml_blocker"] == "attr_equiv":
             blocker = em.AttrEquivalenceBlocker()
             candidate = blocker.block_tables(selected_ltable, selected_rtable, config["block_attr"], config["block_attr"],
-                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs, n_jobs=-1)
+                                             l_output_attrs=selected_attrs, r_output_attrs=selected_attrs)
     else:
         matcher = em.RFMatcher(name='RF', random_state=0)
         blocker = em.OverlapBlocker()
diff --git a/settings.py b/settings.py
index 905e567..3319be4 100644
--- a/settings.py
+++ b/settings.py
@@ -1,6 +1,6 @@
-ltable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amazon.csv'
-rtable_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/GoogleProducts.csv'
-mapping_path = '/home/w/PycharmProjects/py_entitymatching/py_entitymatching/datasets/end-to-end/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv'
+ltable_path = 'datasets\\Amazon.csv'
+rtable_path = 'datasets\\GoogleProducts.csv'
+mapping_path = 'datasets\\Amzon_GoogleProducts_perfectMapping.csv'
 mapping_lid = 'idAmazon'  # mapping表中左表id名
 mapping_rid = 'idGoogleBase'  # mapping表中右表id名
 ltable_id = 'id'  # 左表id字段名称