From 20c33c0fd8fa7271b88b4407a328a999d4c81d70 Mon Sep 17 00:00:00 2001
From: HuangJintao <1447537163@qq.com>
Date: Thu, 26 Oct 2023 23:00:47 +0800
Subject: [PATCH] Use the GPU to compute metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                    |   2 -
 md_discovery/multi_process_infer_by_pairs.py |  35 +++---
 md_discovery/tmp_discover.py                 | 115 ++++++++++++-------
 samples_generator.py                         | 114 ------------------
 tfile.py                                     |  84 +++++++++-----
 5 files changed, 149 insertions(+), 201 deletions(-)
 delete mode 100644 README.md
 delete mode 100644 samples_generator.py

diff --git a/README.md b/README.md
deleted file mode 100644
index 3c8a8c8..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# matching_dependency
-
diff --git a/md_discovery/multi_process_infer_by_pairs.py b/md_discovery/multi_process_infer_by_pairs.py
index b40970f..d89e51f 100644
--- a/md_discovery/multi_process_infer_by_pairs.py
+++ b/md_discovery/multi_process_infer_by_pairs.py
@@ -7,7 +7,7 @@ import time
 import torch
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModel
-from settings import model, embedding_dict
+from settings import model, embedding_dict, er_output_dir
 from sentence_transformers.util import cos_sim

 conf_thresh = 0.8
@@ -38,12 +38,13 @@ def table_encode(tp_path, fn_path):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
             tp_sentences.append(cell_value)
-    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True)
+    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True, device="cuda")
+    list_tp_embedding = tp_embedding.tolist()
     for row in range(0, tp_length):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
-            embedding_dic[cell_value] = tp_embedding.tolist()[row * tp_width + col]
+            embedding_dic[cell_value] = list_tp_embedding[row * tp_width + col]

     fn_data = pd.read_csv(fn_path, low_memory=False, encoding='ISO-8859-1')
@@ -57,19 +58,20 @@ def table_encode(tp_path, fn_path):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
             fn_sentences.append(cell_value)
-    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True)
+    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True, device="cuda")
+    list_fn_embedding = fn_embedding.tolist()
     for row in range(0, fn_length):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
-            embedding_dic[cell_value] = fn_embedding.tolist()[row * fn_width + col]
+            embedding_dic[cell_value] = list_fn_embedding[row * fn_width + col]

     np.save('embedding_dic.npy', embedding_dic)


 def test_table_encode():
     start = time.time()
-    table_encode('../ml_er/output/tp_single_tuple.csv', '../ml_er/output/fn_single_tuple.csv')
+    table_encode(er_output_dir+'tp_single_tuple.csv', er_output_dir+'fn_single_tuple.csv')
     print(time.time()-start)
@@ -133,6 +135,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    other_columns = list(set(columns) - {target_col})
     md_list = []
     minimal_vio = []
@@ -141,7 +144,7 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)

-    for row1 in data.itertuples():
+    for row1 in tqdm(data.itertuples()):
         # get the index of the current row and slice the data starting from the next row
         i = row1[0]
         data1 = data[i + 1:]
@@ -158,7 +161,7 @@ def inference_from_record_pairs(path, threshold, target_col):
             for md in md_list[:]:
                 lhs_satis = True
                 rhs_satis = True
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.0000001 < md[col]:
                         lhs_satis = False
                         break
@@ -167,7 +170,7 @@ def inference_from_record_pairs(path, threshold, target_col):
                 if lhs_satis == True and rhs_satis == False:
                     md_list.remove(md)
                     violated_mds.append(md)
-            minimal_vio.extend(violated_mds)
+            # minimal_vio.extend(violated_mds)

             for vio_md in violated_mds:
                 # specialize the RHS: we need the RHS to be 100% similar, so there is actually no need to lower the RHS threshold
@@ -179,16 +182,16 @@ def inference_from_record_pairs(path, threshold, target_col):
                 #         md_list.append(spec_r_md)

                 # specialize the LHS
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
                         spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)

-    for vio in minimal_vio[:]:
-        if not if_minimal(vio, md_list, target_col):
-            minimal_vio.remove(vio)
+    # for vio in minimal_vio[:]:
+    #     if not if_minimal(vio, md_list, target_col):
+    #         minimal_vio.remove(vio)

     # fuck = len(minimal_vio)
     # tmp = []
@@ -216,9 +219,9 @@ def inference_from_record_pairs(path, threshold, target_col):
     #     if not if_minimal(_, minimal_vio, target_col):
     #         minimal_vio.remove(_)
     #
-    # for _ in md_list[:]:
-    #     if not if_minimal(_, md_list, target_col):
-    #         md_list.remove(_)
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)

     return md_list, minimal_vio

diff --git a/md_discovery/tmp_discover.py b/md_discovery/tmp_discover.py
index 0edf1b9..b75ee07 100644
--- a/md_discovery/tmp_discover.py
+++ b/md_discovery/tmp_discover.py
@@ -70,6 +70,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
     cols_but_target = list(set(columns) - {target_col})
     length = data.shape[0]
     width = data.shape[1]
@@ -95,7 +96,6 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)

-    start = time.time()
     for row1 in range(0, length - 1):
         terminate = False
         for row2 in range(row1 + 1, length):
@@ -132,9 +132,9 @@ def inference_from_record_pairs(path, threshold, target_col):
                 #         md_list.append(spec_r_md)
                 # specialize the LHS
                 for col in list(set(columns) - {target_col}):
-                    if sims[col] + 0.05 <= 1:
+                    if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.05
+                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
                 if vio_md not in minimal_vio:
@@ -151,9 +151,8 @@ def inference_from_record_pairs(path, threshold, target_col):
         if not if_minimal(vio, md_list, target_col):
             minimal_vio.remove(vio)

-    print(time.time()-start, '\n')
-    print(len(md_list), '\n')
-    print(len(minimal_vio), '\n')
+    print('mds_list\t', len(md_list), '\n')
+    print('vio_list\t', len(minimal_vio), '\n')

     if len(minimal_vio) == 0:
         return md_list, []
@@ -170,49 +169,87 @@ def inference_from_record_pairs(path, threshold, target_col):
     #     pool.join()
     # minimal_vio = list(proxy_minimal_vio)

-    start = time.time()
-    minimal_vio.reverse()
+    # minimal_vio.reverse()
     i = 0
-    while i < len(minimal_vio):
-        print(i)
-        print(len(minimal_vio))
-        current_md = minimal_vio[i]
-        support, confidence = get_one_md_metadata(current_md, data, sim_tensor, target_col)
-        if support < 50:
-            minimal_vio_length = len(minimal_vio)
-            j = i + 1
-            while j < len(minimal_vio):
-                specialization = True
-                next_md = minimal_vio[j]
-                for col in cols_but_target:
-                    if current_md[col] > next_md[col]:
-                        specialization = False
-                        break
-                if specialization:
-                    minimal_vio.remove(next_md)
-                else:
-                    j += 1
-            print('sup')
-            minimal_vio.remove(current_md)
+    remove_list = []
+    fuck = []
+    for md in minimal_vio:
+        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+        fuck.append((support, confidence))
+        if support < 1:
+            print('delete by support')
+            remove_list.append(md)
         if confidence < 0.8:
-            print('conf')
-            minimal_vio.remove(current_md)
-        if support >= 50 and confidence >= 0.8:
-            i += 1
-    print(time.time()-start)
-
-
+            print('delete by confidence')
+            remove_list.append(md)
+    fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
+    # while i < len(minimal_vio):
+    #     print('vio_index\t', i)
+    #     print('vio_length', len(minimal_vio))
+    #     current_md = minimal_vio[i]
+    #     support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
+    #     # if support < 50:
+    #     #     minimal_vio_length = len(minimal_vio)
+    #     #     j = i + 1
+    #     #     while j < len(minimal_vio):
+    #     #         specialization = True
+    #     #         next_md = minimal_vio[j]
+    #     #         for col in cols_but_target:
+    #     #             if current_md[col] > next_md[col]:
+    #     #                 specialization = False
+    #     #                 break
+    #     #         if specialization:
+    #     #             minimal_vio.remove(next_md)
+    #     #         else:
+    #     #             j += 1
+    #     #     print('sup')
+    #     #     minimal_vio.remove(current_md)
+    #     if support < 1:
+    #         print('delete by support')
+    #         minimal_vio.remove(current_md)
+    #     if confidence < 0.8:
+    #         print('delete by confidence')
+    #         minimal_vio.remove(current_md)
+    #     if support >= 1 and confidence >= 0.8:
+    #         i += 1

-    t1 = time.time()
     for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
-    print(time.time() - t1)
+
+    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
     return md_list, minimal_vio


+def get_metrics(current_md, data, sim_tensor, target_col, target_index):
+    columns = data.columns.values.tolist()
+    length = data.shape[0]
+    width = data.shape[1]
+
+    md_tensor = list(current_md.values())
+    md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support = torch.count_nonzero(sup_tensor_int).item()
+
+    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = ini_slice.int()
+    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
+    confidence = confidence_numerator / support
+
+    return support, confidence
+
+
 def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)
diff --git a/samples_generator.py b/samples_generator.py
deleted file mode 100644
index 0b15a09..0000000
--- a/samples_generator.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import os
-import random
-
-import pandas as pd
-import Levenshtein
-
-import ml_er.ml_entity_resolver
-
-
-def my_Levenshtein_ratio(str1, str2):
-    if max(len(str1), len(str2)) == 0:
-        return 1
-    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # a list of md paths is passed in
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # open each md file
-        with open(md_path, 'r') as f:
-            # read the md on each line and add it to this file's md list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-# input: list of md paths / path of the prediction table / number of random draws
-# output: some positive samples (with a gold column but no prediction column)
-def generate_samples(md_path_list, pred_path, count: int):
-    all_mds = load_mds(md_path_list)
-
-    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
-    predictions.fillna("", inplace=True)
-    predictions = predictions.astype(str)
-    pred_attrs = predictions.columns.values.tolist()  # columns of the prediction table, with prefixes, including gold and predict
-    attrs = []  # columns without prefixes, excluding gold and predict
-    l_attrs = []
-    r_attrs = []
-    for _ in pred_attrs:
-        if _.startswith('ltable_'):
-            attrs.append(_.replace('ltable_', ''))
-            l_attrs.append(_)
-        elif _.startswith('rtable'):
-            r_attrs.append(_)
-
-    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
-    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
-
-    fpl = fp[l_attrs]
-    fpr = fp[r_attrs]
-    # unify the column names of the left and right halves
-    fpl.columns = attrs
-    fpr.columns = attrs
-    fnl = fn[l_attrs]
-    fnr = fn[r_attrs]
-    fnl.columns = attrs
-    fnr.columns = attrs
-    fp = pd.concat([fpl, fpr])
-    fn = pd.concat([fnl, fnr])
-    df = pd.concat([fp, fn])
-    length = len(df)
-
-    result = pd.DataFrame()
-    for i in range(0, count):
-        dic = {}
-        for _ in attrs:
-            if _ == 'id':
-                index = random.randint(0, length-1)
-                value = df.iloc[index]['id']
-                dic['ltable_'+_] = value
-                dic['rtable_'+_] = value
-            else:
-                index1 = random.randint(0, length-1)
-                index2 = random.randint(0, length-1)
-                value1 = df.iloc[index1][_]
-                value2 = df.iloc[index2][_]
-                dic['ltable_'+_] = value1
-                dic['rtable_'+_] = value2
-
-        for md in all_mds:
-            satis = True
-            for _ in attrs:
-                if my_Levenshtein_ratio(str(dic['ltable_'+_]), str(dic['rtable_'+_])) < md[_]:
-                    satis = False
-                    break
-            if satis:
-                series = pd.Series(dic)
-                result = result._append(series, ignore_index=True)
-    result['gold'] = 1
-    return result
-
-    # check whether the dict satisfies some md; if it does, convert it to a Series and insert it into the dataframe (initially empty)
-
-
-if __name__ == '__main__':
-    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
-
-    pre_p = '/home/w/pred.csv'
-    generate_samples(md_paths, pre_p, 10000)
-    # setting the number of random draws to a thousand or even ten thousand is fine
diff --git a/tfile.py b/tfile.py
index f6005f7..d85cc0c 100644
--- a/tfile.py
+++ b/tfile.py
@@ -8,8 +8,60 @@ from tqdm import tqdm
 from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
 from md_discovery import tmp_discover
 from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
+
+
 def fuck(i):
-    i = i*i+1
+    i = i * i + 1
+
+
+def test1():
+    li = [[[6, 6, 2],
+           [2, 4, 6],
+           [2, 4, 7],
+           [3, 6, 4]],
+          [[6, 2, 7],
+           [3, 2, 4],
+           [5, 3, 5],
+           [6, 2, 4]],
+          [[7, 2, 2],
+           [6, 3, 2],
+           [6, 4, 3],
+           [6, 5, 6]]]
+    tensor = torch.Tensor(li)
+    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
+    print(norm_tensor, '\n')
+    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
+    print(sim_ten / 2 + 0.5, '\n')
+    print(sim_ten.size())
+
+
+def test2():
+    multiprocessing.set_start_method("spawn")
+    manager = multiprocessing.Manager()
+    lock = manager.Lock()
+    pool = multiprocessing.Pool(16)
+    with manager:
+        for _ in tqdm(range(0, 1000)):
+            result = pool.apply_async(fuck, args=(_,))
+            print(result)
+
+
+def test3():
+    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
+    ll = list(dic.values())
+    ten = torch.Tensor(ll)
+    t = ten.unsqueeze(1)
+    t = t.unsqueeze(2)
+    y = t.repeat(1, 742, 742)
+    print(ten)
+    print(y)
+    print(torch.isfinite(ten))
+    print(torch.count_nonzero(y).item())
+
+
+def test4():
+    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
+    print(torch.count_nonzero(one_bool_tensor).item())


 if __name__ == '__main__':
@@ -17,32 +69,4 @@ if __name__ == '__main__':
     start = time.time()
     tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
     # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
     tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    print(time.time()-start)
-
-    # li = [[[6, 6, 2],
-    #        [2, 4, 6],
-    #        [2, 4, 7],
-    #        [3, 6, 4]],
-    #       [[6, 2, 7],
-    #        [3, 2, 4],
-    #        [5, 3, 5],
-    #        [6, 2, 4]],
-    #       [[7, 2, 2],
-    #        [6, 3, 2],
-    #        [6, 4, 3],
-    #        [6, 5, 6]]]
-    # tensor = torch.Tensor(li)
-    # norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    # print(norm_tensor, '\n')
-    # sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    # print(sim_ten/2 + 0.5, '\n')
-    # print(sim_ten.size())
-
-    # multiprocessing.set_start_method("spawn")
-    # manager = multiprocessing.Manager()
-    # lock = manager.Lock()
-    # pool = multiprocessing.Pool(16)
-    # with manager:
-    #     for _ in tqdm(range(0, 1000)):
-    #         result = pool.apply_async(fuck, args=(_,))
-    #         print(result)
+    print(time.time() - start)
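
For context, the core of this commit is the new get_metrics function in md_discovery/tmp_discover.py: instead of re-scanning record pairs in Python loops, it compares a precomputed similarity tensor of shape (width, length, length) against one MD's per-column thresholds on the GPU, then counts the record pairs that satisfy the LHS (support) and, among those, the ones that also satisfy the RHS (confidence). The snippet below is a minimal, self-contained sketch of that computation on CPU tensors; the shapes, thresholds, and random similarities are illustrative assumptions, not values from the repository.

import torch

# Illustrative sizes: 3 columns, 4 records, column 2 is the target (RHS) attribute.
width, length, target_index = 3, 4, 2
sim_tensor = torch.rand(width, length, length)   # stand-in for the precomputed pairwise similarities
md = torch.tensor([0.7, 0.8, 0.9])               # per-column thresholds of one candidate MD

# Broadcast the thresholds over all record pairs and test every column at once.
thresholds = md.view(width, 1, 1).expand(width, length, length)
satisfied = torch.ge(sim_tensor, thresholds)

# Support: pairs whose non-target columns all meet their thresholds.
lhs_mask = torch.ones((length, length), dtype=torch.bool)
for col in range(width):
    if col != target_index:
        lhs_mask &= satisfied[col]
support = int(torch.count_nonzero(lhs_mask))

# Confidence: fraction of supporting pairs whose target column also meets its threshold.
rhs_mask = lhs_mask & satisfied[target_index]
confidence = int(torch.count_nonzero(rhs_mask)) / support if support else 0.0
print(support, confidence)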