Compute metrics on the GPU

HuangJintao committed 1 year ago
parent 1ccfc9abd6
commit 20c33c0fd8

@@ -1,2 +0,0 @@
-# matching_dependency

@@ -7,7 +7,7 @@ import time
 import torch
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModel
-from settings import model, embedding_dict
+from settings import model, embedding_dict, er_output_dir
 from sentence_transformers.util import cos_sim
 conf_thresh = 0.8
@@ -38,12 +38,13 @@ def table_encode(tp_path, fn_path):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
             tp_sentences.append(cell_value)
-    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True)
+    tp_embedding = model.encode(tp_sentences, convert_to_tensor=True, device="cuda")
+    list_tp_embedding = tp_embedding.tolist()
     for row in range(0, tp_length):
         for col in range(0, tp_width):
             cell_value = tp_data.values[row, col]
-            embedding_dic[cell_value] = tp_embedding.tolist()[row * tp_width + col]
+            embedding_dic[cell_value] = list_tp_embedding[row * tp_width + col]
     fn_data = pd.read_csv(fn_path, low_memory=False, encoding='ISO-8859-1')
@@ -57,19 +58,20 @@ def table_encode(tp_path, fn_path):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
             fn_sentences.append(cell_value)
-    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True)
+    fn_embedding = model.encode(fn_sentences, convert_to_tensor=True, device="cuda")
+    list_fn_embedding = fn_embedding.tolist()
     for row in range(0, fn_length):
         for col in range(0, fn_width):
             cell_value = fn_data.values[row, col]
-            embedding_dic[cell_value] = fn_embedding.tolist()[row * fn_width + col]
+            embedding_dic[cell_value] = list_fn_embedding[row * fn_width + col]
     np.save('embedding_dic.npy', embedding_dic)
 
 
 def test_table_encode():
     start = time.time()
-    table_encode('../ml_er/output/tp_single_tuple.csv', '../ml_er/output/fn_single_tuple.csv')
+    table_encode(er_output_dir+'tp_single_tuple.csv', er_output_dir+'fn_single_tuple.csv')
     print(time.time()-start)
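The table_encode hunks above are essentially a batching fix: all cell values are encoded in one GPU call (device="cuda") and the embedding tensor is moved to the CPU once, instead of calling .tolist() inside the per-cell loop. A minimal sketch of the same pattern, not taken from the repository (the model name and variable names are illustrative):

# Sketch: batch-encode cell values on the GPU, then convert to a Python list once.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model name, for illustration only
cells = ['apple', 'banana', 'cherry']

embeddings = model.encode(cells, convert_to_tensor=True, device='cuda')
embedding_list = embeddings.tolist()  # single GPU -> CPU transfer
embedding_dic = {cell: embedding_list[i] for i, cell in enumerate(cells)}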
@@ -133,6 +135,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    other_columns = list(set(columns) - {target_col})
     md_list = []
     minimal_vio = []
@@ -141,7 +144,7 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)
-    for row1 in data.itertuples():
+    for row1 in tqdm(data.itertuples()):
         # Get the index of the current row and slice starting from the next row
         i = row1[0]
         data1 = data[i + 1:]
@@ -158,7 +161,7 @@ def inference_from_record_pairs(path, threshold, target_col):
             for md in md_list[:]:
                 lhs_satis = True
                 rhs_satis = True
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.0000001 < md[col]:
                         lhs_satis = False
                         break
@@ -167,7 +170,7 @@ def inference_from_record_pairs(path, threshold, target_col):
                 if lhs_satis == True and rhs_satis == False:
                     md_list.remove(md)
                     violated_mds.append(md)
-            minimal_vio.extend(violated_mds)
+            # minimal_vio.extend(violated_mds)
             for vio_md in violated_mds:
                 # Specialize the RHS: we require the RHS to be 100% similar, so there is actually no need to lower the RHS threshold
@@ -179,16 +182,16 @@ def inference_from_record_pairs(path, threshold, target_col):
                 # md_list.append(spec_r_md)
                 # Specialize the LHS
-                for col in list(set(columns) - {target_col}):
+                for col in other_columns:
                     if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
                         spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
-    for vio in minimal_vio[:]:
-        if not if_minimal(vio, md_list, target_col):
-            minimal_vio.remove(vio)
+    # for vio in minimal_vio[:]:
+    #     if not if_minimal(vio, md_list, target_col):
+    #         minimal_vio.remove(vio)
     # fuck = len(minimal_vio)
     # tmp = []
@@ -216,9 +219,9 @@ def inference_from_record_pairs(path, threshold, target_col):
     # if not if_minimal(_, minimal_vio, target_col):
     #     minimal_vio.remove(_)
     #
-    # for _ in md_list[:]:
-    #     if not if_minimal(_, md_list, target_col):
-    #         md_list.remove(_)
+    for _ in md_list[:]:
+        if not if_minimal(_, md_list, target_col):
+            md_list.remove(_)
     return md_list, minimal_vio
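The re-enabled filter at the end of this hunk depends on if_minimal, which is not shown in this commit. A purely illustrative sketch of what such a dominance check could look like (hypothetical name if_minimal_sketch; the repository's actual implementation may differ):

# Hypothetical sketch: an MD candidate is minimal if no other MD in md_list
# dominates it, i.e. imposes thresholds that are all <= the candidate's on
# the non-target columns, with at least one strictly smaller.
def if_minimal_sketch(md, md_list, target_col):
    for other in md_list:
        if other is md:
            continue
        dominated = all(other[col] <= md[col] for col in md if col != target_col)
        strictly = any(other[col] < md[col] for col in md if col != target_col)
        if dominated and strictly:
            return False
    return True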

@@ -70,6 +70,7 @@ def inference_from_record_pairs(path, threshold, target_col):
     data.fillna("", inplace=True)
     data = data.astype(str)
     columns = data.columns.values.tolist()
+    target_index = columns.index(target_col)
     cols_but_target = list(set(columns) - {target_col})
     length = data.shape[0]
     width = data.shape[1]
@@ -95,7 +96,6 @@ def inference_from_record_pairs(path, threshold, target_col):
         init_md[col] = 1 if col == target_col else 0
     md_list.append(init_md)
-    start = time.time()
     for row1 in range(0, length - 1):
         terminate = False
         for row2 in range(row1 + 1, length):
@@ -132,9 +132,9 @@ def inference_from_record_pairs(path, threshold, target_col):
                 # md_list.append(spec_r_md)
                 # Specialize the LHS
                 for col in list(set(columns) - {target_col}):
-                    if sims[col] + 0.05 <= 1:
+                    if sims[col] + 0.01 <= 1:
                         spec_l_md = copy.deepcopy(vio_md)
-                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.05
+                        spec_l_md[col] = threshold if sims[col] < threshold else sims[col] + 0.01
                         if if_minimal(spec_l_md, md_list, target_col):
                             md_list.append(spec_l_md)
                 if vio_md not in minimal_vio:
@@ -151,9 +151,8 @@ def inference_from_record_pairs(path, threshold, target_col):
         if not if_minimal(vio, md_list, target_col):
             minimal_vio.remove(vio)
-    print(time.time()-start, '\n')
-    print(len(md_list), '\n')
-    print(len(minimal_vio), '\n')
+    print('mds_list\t', len(md_list), '\n')
+    print('vio_list\t', len(minimal_vio), '\n')
     if len(minimal_vio) == 0:
         return md_list, []
@@ -170,49 +169,87 @@ def inference_from_record_pairs(path, threshold, target_col):
     # pool.join()
     # minimal_vio = list(proxy_minimal_vio)
-    start = time.time()
-    minimal_vio.reverse()
+    # minimal_vio.reverse()
     i = 0
-    while i < len(minimal_vio):
-        print(i)
-        print(len(minimal_vio))
-        current_md = minimal_vio[i]
-        support, confidence = get_one_md_metadata(current_md, data, sim_tensor, target_col)
-        if support < 50:
-            minimal_vio_length = len(minimal_vio)
-            j = i + 1
-            while j < len(minimal_vio):
-                specialization = True
-                next_md = minimal_vio[j]
-                for col in cols_but_target:
-                    if current_md[col] > next_md[col]:
-                        specialization = False
-                        break
-                if specialization:
-                    minimal_vio.remove(next_md)
-                else:
-                    j += 1
-            print('sup')
-            minimal_vio.remove(current_md)
-        if confidence < 0.8:
-            print('conf')
-            minimal_vio.remove(current_md)
-        if support >= 50 and confidence >= 0.8:
-            i += 1
-    print(time.time()-start)
+    remove_list = []
+    fuck = []
+    for md in minimal_vio:
+        support, confidence = get_metrics(md, data, sim_tensor, target_col, target_index)
+        fuck.append((support, confidence))
+        if support < 1:
+            print('delete by support')
+            remove_list.append(md)
+        if confidence < 0.8:
+            print('delete by confidence')
+            remove_list.append(md)
+    fuck_me = sorted(fuck, key=lambda x: x[1], reverse=True)
+    # while i < len(minimal_vio):
+    #     print('vio_index\t', i)
+    #     print('vio_length', len(minimal_vio))
+    #     current_md = minimal_vio[i]
+    #     support, confidence = get_metrics(current_md, data, sim_tensor, target_col, target_index)
+    #     # if support < 50:
+    #     #     minimal_vio_length = len(minimal_vio)
+    #     #     j = i + 1
+    #     #     while j < len(minimal_vio):
+    #     #         specialization = True
+    #     #         next_md = minimal_vio[j]
+    #     #         for col in cols_but_target:
+    #     #             if current_md[col] > next_md[col]:
+    #     #                 specialization = False
+    #     #                 break
+    #     #         if specialization:
+    #     #             minimal_vio.remove(next_md)
+    #     #         else:
+    #     #             j += 1
+    #     #     print('sup')
+    #     #     minimal_vio.remove(current_md)
+    #     if support < 1:
+    #         print('delete by support')
+    #         minimal_vio.remove(current_md)
+    #     if confidence < 0.8:
+    #         print('delete by confidence')
+    #         minimal_vio.remove(current_md)
+    #     if support >= 1 and confidence >= 0.8:
+    #         i += 1
+    t1 = time.time()
     for _ in minimal_vio[:]:
         if not if_minimal(_, minimal_vio, target_col):
             minimal_vio.remove(_)
+    print(time.time() - t1)
+    print('\033[31m' + 'vio_length\t' + str(len(minimal_vio)) + '\033[0m')
     return md_list, minimal_vio
+
+
+def get_metrics(current_md, data, sim_tensor, target_col, target_index):
+    columns = data.columns.values.tolist()
+    length = data.shape[0]
+    width = data.shape[1]
+    md_tensor = list(current_md.values())
+    md_tensor = torch.tensor(md_tensor, device='cuda')
+    md_tensor_2d = md_tensor.unsqueeze(1)
+    md_tensor_3d = md_tensor_2d.unsqueeze(2)
+    md_tensor_3d = md_tensor_3d.repeat(1, length, length)
+    sup_tensor = torch.ge(sim_tensor, md_tensor_3d)
+    ini_slice = torch.ones((length, length), dtype=torch.bool, device='cuda')
+    for i in range(0, width):
+        if i != target_index:
+            sup_tensor_slice = sup_tensor[i]
+            ini_slice = torch.logical_and(ini_slice, sup_tensor_slice)
+    sup_tensor_int = ini_slice.int()
+    support = torch.count_nonzero(sup_tensor_int).item()
+    ini_slice = torch.logical_and(ini_slice, sup_tensor[target_index])
+    conf_tensor_int = ini_slice.int()
+    confidence_numerator = torch.count_nonzero(conf_tensor_int).item()
+    confidence = confidence_numerator / support
+    return support, confidence
+
+
 def get_mds_metadata(md_list, dataset_path, sim_tensor, target_col):
     data = pd.read_csv(dataset_path, low_memory=False, encoding='ISO-8859-1')
     data.fillna("", inplace=True)

@@ -1,114 +0,0 @@
-import os
-import random
-import pandas as pd
-import Levenshtein
-import ml_er.ml_entity_resolver
-
-
-def my_Levenshtein_ratio(str1, str2):
-    if max(len(str1), len(str2)) == 0:
-        return 1
-    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))
-
-
-def load_mds(paths: list) -> list:
-    if len(paths) == 0:
-        return []
-    all_mds = []
-    # Iterate over the list of MD file paths passed in
-    for md_path in paths:
-        if not os.path.exists(md_path):
-            continue
-        mds = []
-        # Open each MD file
-        with open(md_path, 'r') as f:
-            # Read the MD on each line and add it to this file's MD list
-            for line in f.readlines():
-                md_metadata = line.strip().split('\t')
-                md = eval(md_metadata[0].replace('md:', ''))
-                confidence = eval(md_metadata[2].replace('confidence:', ''))
-                if confidence > 0:
-                    mds.append(md)
-        all_mds.extend(mds)
-    return all_mds
-
-
-# Input: list of MD paths / prediction table path / number of random generations
-# Output: some positive samples (with a gold column, without a prediction column)
-def generate_samples(md_path_list, pred_path, count: int):
-    all_mds = load_mds(md_path_list)
-    predictions = pd.read_csv(pred_path, low_memory=False, encoding='ISO-8859-1')
-    predictions.fillna("", inplace=True)
-    predictions = predictions.astype(str)
-    pred_attrs = predictions.columns.values.tolist()  # columns of the prediction table, with prefixes, including gold and predicted
-    attrs = []  # columns without prefixes, excluding gold and predicted
-    l_attrs = []
-    r_attrs = []
-    for _ in pred_attrs:
-        if _.startswith('ltable_'):
-            attrs.append(_.replace('ltable_', ''))
-            l_attrs.append(_)
-        elif _.startswith('rtable'):
-            r_attrs.append(_)
-    fp = predictions[(predictions['gold'] == '0') & (predictions['predicted'] == '1')]
-    fn = predictions[(predictions['gold'] == '1') & (predictions['predicted'] == '0')]
-    fpl = fp[l_attrs]
-    fpr = fp[r_attrs]
-    # Unify the column names of the left and right parts
-    fpl.columns = attrs
-    fpr.columns = attrs
-    fnl = fn[l_attrs]
-    fnr = fn[r_attrs]
-    fnl.columns = attrs
-    fnr.columns = attrs
-    fp = pd.concat([fpl, fpr])
-    fn = pd.concat([fnl, fnr])
-    df = pd.concat([fp, fn])
-    length = len(df)
-    result = pd.DataFrame()
-    for i in range(0, count):
-        dic = {}
-        for _ in attrs:
-            if _ == 'id':
-                index = random.randint(0, length-1)
-                value = df.iloc[index]['id']
-                dic['ltable_'+_] = value
-                dic['rtable_'+_] = value
-            else:
-                index1 = random.randint(0, length-1)
-                index2 = random.randint(0, length-1)
-                value1 = df.iloc[index1][_]
-                value2 = df.iloc[index2][_]
-                dic['ltable_'+_] = value1
-                dic['rtable_'+_] = value2
-        for md in all_mds:
-            satis = True
-            for _ in attrs:
-                if my_Levenshtein_ratio(str(dic['ltable_'+_]), str(dic['rtable_'+_])) < md[_]:
-                    satis = False
-                    break
-            if satis:
-                series = pd.Series(dic)
-                result = result._append(series, ignore_index=True)
-    result['gold'] = 1
-    return result
-
-
-# Check whether a dict satisfies some MD; if so, convert it to a Series and insert it into the (initially empty) DataFrame
-if __name__ == '__main__':
-    md_paths = ['/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_mds.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/tp_vio.txt',
-                '/home/w/PycharmProjects/matching_dependency/md_discovery/output/fn_vio.txt']
-    pre_p = '/home/w/pred.csv'
-    generate_samples(md_paths, pre_p, 10000)
-    # A generation count of a thousand or even ten thousand is fine

@@ -8,8 +8,60 @@ from tqdm import tqdm
 from md_discovery.multi_process_infer_by_pairs import table_encode, inference_from_record_pairs
 from md_discovery import tmp_discover
 from settings import er_output_dir, similarity_threshold, target_attr, embedding_dict
 
 
 def fuck(i):
-    i = i*i+1
+    i = i * i + 1
+
+
+def test1():
+    li = [[[6, 6, 2],
+           [2, 4, 6],
+           [2, 4, 7],
+           [3, 6, 4]],
+          [[6, 2, 7],
+           [3, 2, 4],
+           [5, 3, 5],
+           [6, 2, 4]],
+          [[7, 2, 2],
+           [6, 3, 2],
+           [6, 4, 3],
+           [6, 5, 6]]]
+    tensor = torch.Tensor(li)
+    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
+    print(norm_tensor, '\n')
+    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
+    print(sim_ten / 2 + 0.5, '\n')
+    print(sim_ten.size())
+
+
+def test2():
+    multiprocessing.set_start_method("spawn")
+    manager = multiprocessing.Manager()
+    lock = manager.Lock()
+    pool = multiprocessing.Pool(16)
+    with manager:
+        for _ in tqdm(range(0, 1000)):
+            result = pool.apply_async(fuck, args=(_,))
+        print(result)
+
+
+def test3():
+    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
+    ll = list(dic.values())
+    ten = torch.Tensor(ll)
+    t = ten.unsqueeze(1)
+    t = t.unsqueeze(2)
+    y = t.repeat(1, 742, 742)
+    print(ten)
+    print(y)
+    print(torch.isfinite(ten))
+    print(torch.count_nonzero(y).item())
+
+
+def test4():
+    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
+    print(torch.count_nonzero(one_bool_tensor).item())
+
+
 if __name__ == '__main__':
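The new test1 exercises the similarity-tensor construction the metrics code relies on: L2-normalize the embeddings along the last dimension, take a batched matmul with the transpose to get pairwise cosine similarities, then rescale from [-1, 1] into [0, 1] with sim / 2 + 0.5. A small sketch with made-up shapes, not repository code:

# Sketch: build a (width, length, length) cosine-similarity tensor from
# per-column embeddings of shape (width, length, dim).
import torch

width, length, dim = 3, 4, 8          # columns, rows, embedding size (illustrative)
embeddings = torch.rand((width, length, dim))
norm = torch.nn.functional.normalize(embeddings, dim=2)
sim_tensor = torch.matmul(norm, norm.transpose(1, 2)) / 2 + 0.5
print(sim_tensor.shape)               # torch.Size([3, 4, 4])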
@@ -17,32 +69,4 @@ if __name__ == '__main__':
     tp_single_tuple_path = er_output_dir + "tp_single_tuple.csv"
     # tp_mds, tp_vio = inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
     tp_mds, tp_vio = tmp_discover.inference_from_record_pairs(tp_single_tuple_path, similarity_threshold, target_attr)
-    print(time.time()-start)
-    # li = [[[6, 6, 2],
-    #        [2, 4, 6],
-    #        [2, 4, 7],
-    #        [3, 6, 4]],
-    #       [[6, 2, 7],
-    #        [3, 2, 4],
-    #        [5, 3, 5],
-    #        [6, 2, 4]],
-    #       [[7, 2, 2],
-    #        [6, 3, 2],
-    #        [6, 4, 3],
-    #        [6, 5, 6]]]
-    # tensor = torch.Tensor(li)
-    # norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
-    # print(norm_tensor, '\n')
-    # sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
-    # print(sim_ten/2 + 0.5, '\n')
-    # print(sim_ten.size())
-    # multiprocessing.set_start_method("spawn")
-    # manager = multiprocessing.Manager()
-    # lock = manager.Lock()
-    # pool = multiprocessing.Pool(16)
-    # with manager:
-    #     for _ in tqdm(range(0, 1000)):
-    #         result = pool.apply_async(fuck, args=(_,))
-    #     print(result)
+    print(time.time() - start)
