commit

1 year ago · c1e07eabb6
parent 9b06ce3840
commit c1e07eabb6
10 changed files with 69 additions and 241 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,5 +6,3 @@ tfile.py
 table_embedding.py
 set_none.py
 generate_matches.py
 ml_er/fuck.py
--- a/draw/draw_f1_inter_bars.py
+++ b/draw/draw_f1_inter_bars.py
@ -39,6 +39,7 @@ if __name__ == '__main__':
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
            title_opts=opts.TitleOpts(title="各数据集F1值与预测结果可解释比例", subtitle="可解释性比重：0.5"),
            # yaxis_opts=opts.AxisOpts(min_=0.95)  # 设置y轴起始点
        )
        .render("output/F1_Inter_bars.html")
    )
--- a/generate_matches.py
+++ b/generate_matches.py
@ -1,21 +0,0 @@
 import csv
 import pandas as pd
 import json
 import sentence_transformers.util
 import torch
 from sentence_transformers import SentenceTransformer
 from torch import nn
 if __name__ == '__main__':
     # Gather the positive pairs (label == 1) of the train/valid/test splits and
     # write them, without the label column and ordered by ltable_id, to matches.csv.
     directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
     split_files = (r'\train.csv', r'\valid.csv', r'\test.csv')
     # Read every split before filtering, so all files are loaded first.
     frames = [pd.read_csv(directory + split_file, encoding='ISO-8859-1') for split_file in split_files]
     positives = [frame[frame['label'] == 1] for frame in frames]
     matches = pd.concat(positives)
     matches = matches.drop(columns=['label'])
     matches = matches.sort_values(by='ltable_id')
     matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)
--- a/hpo/magellan_hpo.py
+++ b/hpo/magellan_hpo.py
@ -2,7 +2,7 @@ import json
 import pickle
 from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Integer, Float
-from ConfigSpace.conditions import InCondition, EqualsCondition
+from ConfigSpace.conditions import InCondition, EqualsCondition, AndConjunction
 from ConfigSpace.read_and_write import json as csj
 import py_entitymatching.catalog.catalog_manager as cm
 import pandas as pd
@ -27,7 +27,7 @@ class Classifier:
        svm_kernel = Categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], default='rbf')
        svm_C = Integer('svm_C', (1, 100), default=1)
        svm_gamma = Categorical('svm_gamma', ['scale', 'auto'], default='scale')
-        svm_degree = Integer('svm_degree', (1, 5), default=3)
+        svm_degree = Integer('svm_degree', (1, 3), default=3)
        svm_constant = Float('svm_constant', (0.0, 5.0), default=0.0)
        dt_splitter = Categorical('dt_splitter', ["best", "random"], default='best')
@ -43,15 +43,20 @@ class Classifier:
        active_dt_splitter = EqualsCondition(child=dt_splitter, parent=ml_matcher, value="dt")
        active_dt_max_features = EqualsCondition(child=dt_max_features, parent=ml_matcher, value="dt")
        active_svm_kernel = EqualsCondition(child=svm_kernel, parent=ml_matcher, value="svm")
        active_svm_gamma = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
        active_svm_degree = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
        active_svm_constant = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
        active_svm_C = EqualsCondition(child=svm_C, parent=ml_matcher, value="svm")
-
+        active_svm_gamma1 = EqualsCondition(child=svm_gamma, parent=ml_matcher, value="svm")
-        cs.add_conditions([active_svm_C, active_svm_constant, active_svm_degree, active_svm_gamma, active_svm_kernel,
+        active_svm_degree1 = EqualsCondition(child=svm_degree, parent=ml_matcher, value="svm")
-                           active_dt_splitter, active_rf_n_estimators, active_dt_max_features, active_rf_max_features,
+        active_svm_constant1 = EqualsCondition(child=svm_constant, parent=ml_matcher, value="svm")
-                           active_tree_max_depth, active_tree_criterion])
+
-
+        active_svm_gamma2 = InCondition(child=svm_gamma, parent=svm_kernel, values=["rbf", "poly", "sigmoid"])
        active_svm_degree2 = EqualsCondition(child=svm_degree, parent=svm_kernel, value="poly")
        active_svm_constant2 = InCondition(child=svm_constant, parent=svm_kernel, values=["poly", "sigmoid"])
        cs.add_conditions([active_svm_C, active_svm_kernel, active_dt_splitter, active_rf_n_estimators,
                           active_dt_max_features, active_rf_max_features, active_tree_max_depth, active_tree_criterion,
                           AndConjunction(active_svm_gamma1, active_svm_gamma2),
                           AndConjunction(active_svm_degree1, active_svm_degree2),
                           AndConjunction(active_svm_constant1, active_svm_constant2)])
        return cs
    def train(self, config: Configuration, seed: int = 0) -> float:
@ -75,7 +80,7 @@ def ml_er_hpo():
        cs,
        crash_cost=1.0,
        deterministic=True,
-        n_trials=50,
+        n_trials=20,
        n_workers=1
    )
--- a/md_discovery/md_mining.py
+++ b/md_discovery/md_mining.py
@ -1,3 +1,4 @@
 import itertools
 import random
 import operator
 from operator import itemgetter
@ -91,6 +92,9 @@ def mining(train: pd.DataFrame):
    # result_list.sort(key=itemgetter(2), reverse=True)
    # 按confidence->support的优先级排序
    result_list.sort(key=itemgetter(2, 1), reverse=True)
    result_list = merge_mds(result_list)
    result_list.sort(key=itemgetter(2, 1), reverse=True)
    # 保存到本地
    mds_to_txt(result_list)
    return result_list
@ -210,3 +214,42 @@ def mds_to_txt(result_list_):
        for _ in result_list_:
            f.write(f'MD: {str(_[0])}\tAbsolute Support: {str(_[1])}\tConfidence: {str(_[2])}')
            f.write('\n')
 # 合并一些MD
 def merge_mds(md_list_):
     """Remove MDs made redundant by another MD with the same support and confidence.

     Each element of ``md_list_`` is indexed as ``(thresholds, support, confidence)``,
     where ``thresholds`` is a dict of similarity thresholds. MDs are grouped
     by ``(support, confidence)``. Inside a group, an MD is dropped when every one
     of its thresholds is greater than or equal to the matching threshold of some
     other MD in the group.

     MDs whose threshold dicts are identical would each make the other redundant;
     in that case only the first occurrence is kept instead of dropping all of them.

     :param md_list_: iterable of ``(thresholds, support, confidence)`` tuples.
     :return: a flat list of the surviving tuples; groups appear in order of first
         occurrence and members keep their relative input order.
     :raises KeyError: if an MD lacks a threshold key present in the MD it is
         compared against.
     """
     # Group the MDs by their (support, confidence) pair.
     groups = {}
     for md_tuple in md_list_:
         groups.setdefault((md_tuple[1], md_tuple[2]), []).append(md_tuple)

     merged = []
     for group in groups.values():
         for i, candidate in enumerate(group):
             thresholds = candidate[0]
             redundant = False
             for j, other in enumerate(group):
                 if i == j:
                     continue
                 # The candidate is only redundant if none of its thresholds is
                 # lower than the other MD's threshold.
                 if not all(thresholds[key_] >= other[0][key_] for key_ in thresholds):
                     continue
                 # Identical MDs: keep the earliest one rather than deleting both.
                 if thresholds == other[0] and i < j:
                     continue
                 redundant = True
                 break
             if not redundant:
                 merged.append(candidate)
     return merged
--- a/ml_er/magellan_new.py
+++ b/ml_er/magellan_new.py
@ -34,7 +34,7 @@ def blocking_mining():
    blocker = em.OverlapBlocker()
    candidate = blocker.block_tables(ltable, rtable, ltable_block_attr, rtable_block_attr, allow_missing=True,
-                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=-1,
+                                     l_output_attrs=attributes, r_output_attrs=attributes, n_jobs=1,
                                     overlap_size=1, show_progress=False)
    candidate['gold'] = 0
    candidate = candidate.reset_index(drop=True)
@ -52,6 +52,7 @@ def blocking_mining():
    candidate.loc[match_rows_mask, 'gold'] = 1
    candidate.fillna(value="", inplace=True)
    # negative样本太多, 采样三倍于positive样本量
    candidate_mismatch = candidate[candidate['gold'] == 0]
    candidate_match = candidate[candidate['gold'] == 1]
    candidate_mismatch = candidate_mismatch.sample(n=3*len(candidate_match))
--- a/set_none.py
+++ b/set_none.py
@ -1,27 +0,0 @@
 import numpy as np
 import pandas as pd
 if __name__ == '__main__':
     # Scratch script: load the two TMDB-TVDB tables and their ground-truth mapping.
     # The clean-up and write-back steps below are kept commented out.
     read_options = {'encoding': 'ISO-8859-1', 'sep': ','}
     lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
                      **read_options)
     rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
                      **read_options)
     mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
                           **read_options)
     # Drop columns
     # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
     #                  'genre_list', 'origin_country'], inplace=True)
     # rt.drop(columns=['job'], inplace=True)
     # Rename columns
     # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
     #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
     # Reorder columns
     # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
     #         'runtimeMinutes']]
     # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
     # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
     # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
     #                sep=',', index=False, header=True, quoting=1)
     print(1)
--- a/settings.py
+++ b/settings.py
@ -1,8 +1,8 @@
 from sentence_transformers import SentenceTransformer
-ltable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableA.csv'
+ltable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableA.csv'
-rtable_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\tableB.csv'
+rtable_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\tableB.csv'
-mapping_path = r'E:\Data\Research\Projects\matching_dependency\datasets\Abt-Buy\matches.csv'
+mapping_path = r'E:\Data\Research\Projects\md_bayesian_er_magellan\datasets\Abt-Buy\matches.csv'
 mapping_lid = 'idAbt'  # mapping表中左表id名
 mapping_rid = 'idBuy'  # mapping表中右表id名
 ltable_block_attr = 'name'
@ -13,12 +13,12 @@ target_attr = 'id'  # 进行md挖掘时的目标字段
 # lr_attrs_map = {}  # 如果两个表中存在对应字段名称不一样的情况，将名称加入列表便于调整一致
 model = SentenceTransformer('E:\\Data\\Research\\Models\\all-MiniLM-L6-v2')
-interpre_weight = 1  # 可解释性权重
+interpre_weight = 0  # 可解释性权重
 similarity_threshold = 0.1
 support_threshold = 1
 confidence_threshold = 0.75
-er_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\ml_er\\output\\'
+er_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\ml_er\\output\\'
-md_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\md_discovery\\output\\'
+md_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\md_discovery\\output\\'
-hpo_output_dir = 'E:\\Data\\Research\\Projects\\matching_dependency\\hpo\\output\\'
+hpo_output_dir = 'E:\\Data\\Research\\Projects\\md_bayesian_er_magellan\\hpo\\output\\'
--- a/table_embedding.py
+++ b/table_embedding.py
@ -1,13 +0,0 @@
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTableQuestionAnswering
 if __name__ == '__main__':
     # Scratch experiment: load a table plus the TAPAS tokenizer/model from local
     # paths and pass the table to the tokenizer in two ways.
     path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
     table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')
     tapas_dir = r'E:\Data\Research\Models\tapas-large-finetuned-wtq'
     tokenizer = AutoTokenizer.from_pretrained(tapas_dir)
     model = AutoModelForTableQuestionAnswering.from_pretrained(tapas_dir)
     encoding = tokenizer(table, return_tensors="pt")
     embedding = tokenizer.encode(table, return_tensors="pt")
     print(1)
--- a/tfile.py
+++ b/tfile.py
@ -1,159 +0,0 @@
 import json
 import multiprocessing
 import os
 import time
 import ConfigSpace
 import numpy as np
 import pandas as pd
 import torch
 from tqdm import tqdm
 from ConfigSpace.read_and_write import json as csj
 from md_discovery import discovery_executor
 from settings import er_output_dir, hpo_output_dir
 def fuck(i):
     """Compute ``i * i + 1`` into the local name only; nothing is returned."""
     squared = i * i
     i = squared + 1
 def test1():
     """Print L2-normalised row vectors and their pairwise similarities rescaled by /2 + 0.5."""
     batches = [
         [[6, 6, 2], [2, 4, 6], [2, 4, 7], [3, 6, 4]],
         [[6, 2, 7], [3, 2, 4], [5, 3, 5], [6, 2, 4]],
         [[7, 2, 2], [6, 3, 2], [6, 4, 3], [6, 5, 6]],
     ]
     raw = torch.Tensor(batches)
     # Normalise along the last axis so each 3-vector has unit length.
     unit = torch.nn.functional.normalize(raw, dim=2)
     print(unit, '\n')
     # Batched product of the unit vectors with their own transpose.
     similarity = unit @ unit.transpose(1, 2)
     print(similarity / 2 + 0.5, '\n')
     print(similarity.shape)
 def test2():
    """Submit 1000 ``fuck`` calls to a 16-process pool and print each async handle.

    The results themselves are never fetched; only the object returned by
    ``apply_async`` is printed. The pool is not closed or joined in this function.
    """
    # Must run before any manager/pool is created; a second call in the same
    # process raises RuntimeError.
    multiprocessing.set_start_method("spawn")
    manager = multiprocessing.Manager()
    # NOTE(review): the lock is created but not used anywhere in this function.
    lock = manager.Lock()
    pool = multiprocessing.Pool(16)
    # The manager is shut down when this block exits.
    with manager:
        for _ in tqdm(range(0, 1000)):
            result = pool.apply_async(fuck, args=(_,))
            print(result)
 def test3():
     """Tile a vector of five values into a (5, 742, 742) tensor and print a few checks on it."""
     weights = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
     ten = torch.Tensor(list(weights.values()))
     # Add two trailing axes, then repeat each value over a 742 x 742 grid.
     y = ten.unsqueeze(1).unsqueeze(2).repeat(1, 742, 742)
     print(ten)
     print(y)
     print(torch.isfinite(ten))
     print(torch.count_nonzero(y).item())
 def test4():
     """Print how many entries of an all-True 3x3x3 boolean tensor are non-zero."""
     all_true = torch.ones(3, 3, 3, dtype=torch.bool)
     nonzero_count = torch.count_nonzero(all_true).item()
     print(nonzero_count)
 def test5():
     """Print the element-wise product of two 2x3 integer tensors and its per-row sums."""
     ten1 = torch.tensor([[1, 2, 3], [7, 8, 9]])
     ten2 = torch.tensor([[4, 5, 6], [11, 12, 15]])
     result = torch.mul(ten1, ten2)
     # Summing over dim 1 leaves one total per row.
     r = result.sum(dim=1)
     print('\n')
     print(result)
     print(r)
 def test6():
     """Print a (2, 3, 3) tensor normalised along dim 1 and along dim 2, then a 2x3 tensor's shape."""
     # 3x3 grid holding 1.0 .. 9.0, stacked twice to form the batch.
     grid = torch.arange(1., 10.).reshape(3, 3)
     table_tensor = torch.stack([grid, grid])
     t = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
     normalize = torch.nn.functional.normalize
     norm1 = normalize(table_tensor, dim=1)
     norm2 = normalize(table_tensor, dim=2)
     print('\n')
     print(norm1)
     print(norm2)
     print(t.shape)
 def test7():
     """Print an iteration number derived from the ``eval_result*`` files in ``er_output_dir``.

     Starts at 1. For each directory entry whose name starts with ``eval_result``,
     the character at index 12 is parsed as an int and incremented; the value
     from the last such entry listed is the one printed.
     """
     iterations = 1
     # An empty listing simply skips the loop, leaving the default of 1.
     for filename in os.listdir(er_output_dir):
         if not filename.startswith('eval_result'):
             continue
         iterations = int(filename[12:13]) + 1
     print(iterations)
 def test8():
     """Load ``configspace.json`` from ``hpo_output_dir`` and parse it with ``csj.read``."""
     configspace_path = hpo_output_dir + "configspace.json"
     with open(configspace_path, 'r') as load_f:
         dict_configspace = json.load(load_f)
     # csj.read is given a JSON string, so the parsed dict is serialised again.
     configspace = csj.read(json.dumps(dict_configspace))
 def test9():
     """Convert a JSON-lines file to CSV, read the CSV back, and print ``1``."""
     json_path = r'./datasets/t.json'
     csv_path = r'./datasets/s.csv'
     pd.read_json(json_path, encoding='ISO-8859-1', lines=True).to_csv(csv_path)
     d = pd.read_csv(csv_path, encoding='ISO-8859-1')
     print(1)
 def test10():
     """Read predictions.csv, rename its columns, and write it out as the DBLP-GoogleScholar tableB.csv."""
     source_path = r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv'
     target_path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv'
     column_names = ["id", "title", "authors", "venue", "year"]
     rtable = pd.read_csv(source_path, encoding='ISO-8859-1')
     print(1)
     rtable.columns = column_names
     # quoting=1 matches csv.QUOTE_ALL.
     rtable.to_csv(target_path, sep=',', index=False, header=True, quoting=1)
 def test11():
     """Write a fixed sample configuration to ``incumbent.json`` in ``hpo_output_dir``."""
     values = {
         'block_attr': 'class',
         'confidence_thresh': 0.2717823249253852,
         'ml_blocker': 'attr_equiv',
         'ml_matcher': 'ln',
         'similarity_thresh': 0.20681820299103484,
         'support_thresh': 129,
     }
     serialized = json.dumps(values, indent=4)
     with open(hpo_output_dir + "incumbent.json", "w") as f:
         f.write(serialized)
 def test12():
     """Print every key of ``incumbent.json`` with its value and the value's Python type."""
     incumbent_path = hpo_output_dir + "incumbent.json"
     with open(incumbent_path, 'r') as f:
         dic = json.load(f)
     for key, value in dic.items():
         print(f'Key:{key}\tValue:{value}\tType:{type(value)}')
 def test13():
     """Print one configuration path for every entry found in the outcome directory."""
     outcome_dir = r'E:\Data\Research\Outcome'
     configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
     # Each entry name is inserted between the root and the fixed sub-directory.
     f = [outcome_dir + rf'\{dataset}' + configs_dir for dataset in os.listdir(outcome_dir)]
     print(f)