From d5f60a4c99a6e158bf2d456ac1be4ca5f02f7e82 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Mon, 26 Feb 2024 16:03:52 +0800
Subject: [PATCH] =?UTF-8?q?=E6=A0=B9=E6=8D=AEJedAI=E6=A0=B7=E4=BE=8B?=
 =?UTF-8?q?=E5=8A=A0=E5=85=A5=E6=95=B0=E5=80=BC=E7=B1=BB=E8=B6=85=E5=8F=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 hpo/er_model_hpo.py         | 16 ++++++++++++----
 ml_er/ml_entity_resolver.py | 10 ++++------
 settings.py                 |  2 +-
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
index 13ee960..80aa356 100644
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import json
-from time import *
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
+from ConfigSpace import EqualsCondition
 from ConfigSpace.read_and_write import json as csj
 from smac import HyperparameterOptimizationFacade, Scenario
 from settings import *
@@ -20,8 +20,12 @@ class Classifier:
 
         jed_blocker = Categorical("jed_blocker",
                                   ["Standard", "QGrams", "ExtendedQG", "SuffixArrays", "ExtendedSA"])
+        qgrams = Integer('qgrams', (3, 10), default=6)
+        use_qgrams = EqualsCondition(child=qgrams, parent=jed_blocker, value="QGrams")
         block_attr = Categorical("block_attr", block_attr_items)
-        # filter_ratio = Float("filter_ratio", (0.0, 1.0), default=0.8)
+
+        block_filtering_ratio = Float("block_filtering_ratio", (0.7, 0.95), default=0.8)
+
         meta_blocker = Categorical("meta_blocker",
                                    ["WEP", "WNP", "CEP", "CNP", "BLAST", "RCNP", "RWNP", "CP"])
         weighting_scheme = Categorical("weighting_scheme",
@@ -35,11 +39,14 @@ class Classifier:
                                          ['char_tokenizer', 'word_tokenizer', 'white_space_tokenizer'])
         matching_vectorizer = Categorical("matching_vectorizer",
                                           ['tfidf', 'tf', 'boolean'])
+        matching_sim_thresh = Float("similarity_threshold", (0.05, 0.9))
         clusteror = Categorical("clusteror_name",
-                                ["CCC", "UMC", "EC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
+                                ["CCC", "UMC", "CenterC", "BMC", "MCC", "CC", "CTC", "MCL", "KMAC", "RSRC"])
 
         cs.add_hyperparameters([jed_blocker, block_attr, meta_blocker, weighting_scheme, matching_metric,
-                                matching_tokenizer, matching_vectorizer, clusteror])
+                                matching_tokenizer, matching_vectorizer, clusteror, qgrams, block_filtering_ratio,
+                                matching_sim_thresh])
+        cs.add_conditions([use_qgrams])
         return cs
 
     def train(self, config: Configuration, seed: int = 0) -> float:
@@ -57,6 +64,7 @@ def ml_er_hpo():
 
     scenario = Scenario(
         cs,
+        crash_cost=1,
         deterministic=True,
         n_trials=50,  # We want to run max 50 trials (combination of config and seed)
         n_workers=1
diff --git a/ml_er/ml_entity_resolver.py b/ml_er/ml_entity_resolver.py
index a109e22..ba44db7 100644
--- a/ml_er/ml_entity_resolver.py
+++ b/ml_er/ml_entity_resolver.py
@@ -163,7 +163,7 @@ def er_process(config: Configuration):
         case "Standard":
             blocker = StandardBlocking()
         case "QGrams":
-            blocker = QGramsBlocking()
+            blocker = QGramsBlocking(config["qgrams"])
         case "ExtendedQG":
             blocker = ExtendedQGramsBlocking()
         case "SuffixArrays":
@@ -177,7 +177,7 @@ def er_process(config: Configuration):
     cleaned_blocks = bp.process(blocks, data, tqdm_disable=False)
 
     # block cleaning(optional)
-    bf = BlockFiltering(ratio=0.8)  # todo what is ratio for?
+    bf = BlockFiltering(ratio=config["block_filtering_ratio"])  # TODO: confirm what BlockFiltering's ratio controls (now tuned via config)
     filtered_blocks = bf.process(cleaned_blocks, data, tqdm_disable=False)
 
     # Comparison Cleaning - Meta Blocking(optional)
@@ -208,7 +208,7 @@ def er_process(config: Configuration):
         tokenizer=config["matching_tokenizer"],
         vectorizer=config["matching_vectorizer"],
         qgram=3,
-        similarity_threshold=0.0
+        similarity_threshold=config["similarity_threshold"]
     )
 
     pairs_graph = em.predict(candidate_pairs_blocks, data, tqdm_disable=True)
@@ -221,8 +221,6 @@ def er_process(config: Configuration):
             clusteror = ConnectedComponentsClustering()
         case "UMC":
             clusteror = UniqueMappingClustering()
-        case "EC":
-            clusteror = ExactClustering()
         case "CenterC":
             clusteror = CenterClustering()
         case "BMC":
@@ -240,7 +238,7 @@ def er_process(config: Configuration):
         case "RSRC":
             clusteror = RicochetSRClustering()
     # 得到预测结果与评估指标
-    clusters = clusteror.process(pairs_graph, data, similarity_threshold=0.17)
+    clusters = clusteror.process(pairs_graph, data, similarity_threshold=0.17)  # TODO: make the clustering similarity threshold configurable
     matches_dataframe = clusteror.export_to_df(clusters)
     matches_dataframe_path = er_output_dir + r'\matches_dataframe.csv'
     matches_dataframe.to_csv(matches_dataframe_path, sep=',', index=False, header=True, quoting=1)
diff --git a/settings.py b/settings.py
index 1535028..b4b7ce0 100644
--- a/settings.py
+++ b/settings.py
@@ -11,7 +11,7 @@ target_attr = 'id'  # 进行md挖掘时的目标字段
 # lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
 
 model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
-interpre_weight = 0.5  # 可解释性权重
+interpre_weight = 0  # 可解释性权重
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.25