From ff520728675347ec8f8691d2e1e865ee8d48c47a Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Mon, 13 Nov 2023 10:34:19 +0800
Subject: [PATCH] 11.13: switch to Abt-Buy dataset, retune thresholds, remove debug comments

---
 hpo/er_model_hpo.py          |  4 ++--
 md_discovery/md_discover.py  |  1 -
 md_discovery/tmp_discover.py |  5 ++---
 settings.py                  | 23 +++++++++++++----------
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/hpo/er_model_hpo.py b/hpo/er_model_hpo.py
index 16c7d4c..486bfc0 100644
--- a/hpo/er_model_hpo.py
+++ b/hpo/er_model_hpo.py
@@ -63,7 +63,6 @@ class Classifier:
 
     # train 就是整个函数 只需将返回结果由预测变成预测结果的评估
     def train(self, config: Configuration, seed: int = 0) -> float:
-        # print(f"BRAINFUCK:{config.values()}")
         cm.del_catalog()
         attrs_with_l_prefix = ['ltable_' + i for i in selected_attrs]  # 字段名加左前缀
         attrs_with_r_prefix = ['rtable_' + i for i in selected_attrs]  # 字段名加右前缀
@@ -206,6 +205,7 @@ class Classifier:
         #     return 1
         # f1 = indicators["F1"]
         performance = interpre_weight * interpretability + (1 - interpre_weight) * f1
+        print('Interpretability: ', interpretability)
         return 1 - performance
 
 
@@ -221,7 +221,7 @@ def ml_er_hpo():
     scenario = Scenario(
         cs,
         deterministic=True,
-        n_trials=10,  # We want to run max 50 trials (combination of config and seed)
+        n_trials=12,  # Maximum number of trials to run (a trial is a combination of config and seed)
         n_workers=1
     )
 
diff --git a/md_discovery/md_discover.py b/md_discovery/md_discover.py
index 876513a..ae5d50b 100644
--- a/md_discovery/md_discover.py
+++ b/md_discovery/md_discover.py
@@ -16,7 +16,6 @@ from settings import *
 
 
 def md_discover():
-    # 目前可以仿照这个main函数写
     t_single_tuple_path = er_output_dir + "t_single_tuple.csv"
     # 输入：csv文件路径，md左侧相似度阈值，md右侧目标字段
     # 输出：2个md列表，列表1中md无violation,列表2中md有violation但confidence满足阈值
diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py
index a9f58b6..d4d7870 100644
--- a/md_discovery/tmp_discover.py
+++ b/md_discovery/tmp_discover.py
@@ -59,13 +59,13 @@ def pairs_inference(path, threshold, target_col):
     norm_table_tensor = torch.nn.functional.normalize(table_tensor, dim=2)
     sim_tensor = torch.matmul(norm_table_tensor, norm_table_tensor.transpose(1, 2))
 
-    torch.save(sim_tensor, md_output_dir + "tensor.pt")
+    # torch.save(sim_tensor, md_output_dir + "tensor.pt")
 
     md_list = []
     minimal_vio = []
     init_md = {}
     for col in columns:
-        init_md[col] = 1 if col == target_col else 0
+        init_md[col] = 1 if col == target_col else -1
     md_list.append(init_md)
 
     for row1 in range(0, length - 1):
@@ -145,7 +145,6 @@ def pairs_inference(path, threshold, target_col):
         return md_list, []
 
     remove_list = []
-    # fuck = []
     if len(md_list) > 0:
         md_rm_list = []
         for _ in md_list:
diff --git a/settings.py b/settings.py
index c257342..9537ded 100644
--- a/settings.py
+++ b/settings.py
@@ -1,20 +1,23 @@
 from sentence_transformers import SentenceTransformer
 import numpy as np
 
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\iTunes-Amazon_dirty\matches.csv'
-mapping_lid = 'ltable_id'  # mapping表中左表id名
-mapping_rid = 'rtable_id'  # mapping表中右表id名
+ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
+rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
+mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+mapping_lid = 'idAbt'  # name of the left-table id column in the mapping table
+mapping_rid = 'idBuy'  # name of the right-table id column in the mapping table
 ltable_id = 'id'  # 左表id字段名称
 rtable_id = 'id'  # 右表id字段名称
 target_attr = 'id'  # 进行md挖掘时的目标字段
 # lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
-similarity_threshold = 0.16
-support_threshold = 70
-confidence_threshold = 0.3
-interpre_weight = 1  # 可解释性权重
+
+model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+similarity_threshold = 0.2
+support_threshold = 100
+confidence_threshold = 0.4
+interpre_weight = 0.5  # interpretability weight
+
 er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
 md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
 hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
-model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+