parent 9b06ce3840
commit c1e07eabb6

@@ -1,21 +0,0 @@
import pandas as pd

if __name__ == '__main__':
    # Merge the positive pairs (label == 1) from the train/valid/test splits
    # of the dirty DBLP-ACM benchmark into a single matches.csv.
    directory = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM_dirty'
    train = pd.read_csv(directory + r'\train.csv', encoding='ISO-8859-1')
    valid = pd.read_csv(directory + r'\valid.csv', encoding='ISO-8859-1')
    test = pd.read_csv(directory + r'\test.csv', encoding='ISO-8859-1')
    train = train[train['label'] == 1]
    valid = valid[valid['label'] == 1]
    test = test[test['label'] == 1]
    matches = pd.concat([train, valid, test])
    matches.drop(columns=['label'], inplace=True)
    matches = matches.sort_values(by='ltable_id')
    matches.to_csv(directory + r'\matches.csv', sep=',', index=False, header=True)
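    # Illustrative sanity check (assumes the usual ltable_id/rtable_id split format):
    # check = pd.read_csv(directory + r'\matches.csv', encoding='ISO-8859-1')
    # assert {'ltable_id', 'rtable_id'} <= set(check.columns)
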
@@ -1,27 +0,0 @@
import pandas as pd

if __name__ == '__main__':
    lt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv',
                     encoding='ISO-8859-1', sep=',')
    rt = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv',
                     encoding='ISO-8859-1', sep=',')
    mapping = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
                          encoding='ISO-8859-1', sep=',')
    # Drop columns
    # lt.drop(columns=['numberOfSeasons', 'numberOfEpisodes', 'birthDate', 'last_air_date', 'release_year', 'runtime',
    #                  'genre_list', 'origin_country'], inplace=True)
    # rt.drop(columns=['job'], inplace=True)
    # Rename columns
    # rt = rt.rename(columns={'birthDate': 'birthYear', 'release_year': 'startYear',
    #                         'last_air_date': 'endYear', 'runtime': 'runtimeMinutes'})
    # Reorder columns
    # rt = rt[['id', 'title', 'name', 'episodeNumber', 'seasonNumber', 'birthYear', 'endYear', 'startYear', 'genre_list',
    #         'runtimeMinutes']]
    # lt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tmdb.csv', sep=',', index=False, header=True, quoting=1)
    # rt.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\tvdb.csv', sep=',', index=False, header=True, quoting=1)
    # mapping.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\TMDB-TVDB\gt.csv',
    #                sep=',', index=False, header=True, quoting=1)
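    # Note on the commented to_csv calls: quoting=1 is csv.QUOTE_ALL, i.e. every
    # field gets quoted when the tables are written back out.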
    print(1)
@@ -1,13 +0,0 @@
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

if __name__ == '__main__':
    path = r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-ACM\tableA.csv'
    table = pd.read_csv(path, low_memory=False, encoding='ISO-8859-1')

    tokenizer = AutoTokenizer.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    model = AutoModelForTableQuestionAnswering.from_pretrained(r'E:\Data\Research\Models\tapas-large-finetuned-wtq')
    # The TAPAS tokenizer requires every table cell to be a string and encodes a
    # table together with a query; passing the raw DataFrame alone (as the
    # original code did) fails on numeric columns. The query below is a
    # hypothetical placeholder.
    table = table.astype(str)
    encoding = tokenizer(table=table, queries='hypothetical example query', return_tensors="pt")
    print(1)
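    # Aside (illustrative sketch, not the original approach): for one embedding
    # per row, a sentence encoder is the more direct route than TAPAS; the local
    # checkpoint path below is assumed to exist.
    # from sentence_transformers import SentenceTransformer
    # st = SentenceTransformer(r'E:\Data\Research\Models\roberta-large-nli-stsb-mean-tokens')
    # row_embeddings = st.encode(table.agg(' '.join, axis=1).tolist(), convert_to_tensor=True)
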
@@ -1,159 +0,0 @@
import json
import multiprocessing
import os

import pandas as pd
import torch
from tqdm import tqdm
from ConfigSpace.read_and_write import json as csj
from settings import er_output_dir, hpo_output_dir


def square_plus_one(i):
    # Trivial worker for the multiprocessing test below.
    return i * i + 1


def test1():
    # Pairwise cosine similarity within each 4x3 slice: L2-normalise along the
    # feature axis, then multiply by the transpose.
    li = [[[6, 6, 2],
           [2, 4, 6],
           [2, 4, 7],
           [3, 6, 4]],
          [[6, 2, 7],
           [3, 2, 4],
           [5, 3, 5],
           [6, 2, 4]],
          [[7, 2, 2],
           [6, 3, 2],
           [6, 4, 3],
           [6, 5, 6]]]
    tensor = torch.Tensor(li)
    norm_tensor = torch.nn.functional.normalize(tensor, dim=2)
    print(norm_tensor, '\n')
    sim_ten = torch.matmul(norm_tensor, norm_tensor.transpose(1, 2))
    # Map cosine values from [-1, 1] into [0, 1].
    print(sim_ten / 2 + 0.5, '\n')
    print(sim_ten.size())
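    # Illustrative cross-check: sentence_transformers.util.cos_sim computes the
    # same pairwise cosine matrix for a single 2-D slice.
    # import sentence_transformers.util as stu
    # assert torch.allclose(stu.cos_sim(tensor[0], tensor[0]), sim_ten[0])

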
def test2():
    multiprocessing.set_start_method("spawn")
    manager = multiprocessing.Manager()
    lock = manager.Lock()  # currently unused
    pool = multiprocessing.Pool(16)
    with manager:
        for _ in tqdm(range(0, 1000)):
            result = pool.apply_async(square_plus_one, args=(_,))
            print(result)
    pool.close()
    pool.join()
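    # Note: apply_async returns an AsyncResult immediately, which is what the
    # print above shows; the worker's value would come from result.get(), e.g.
    # values = [pool.apply_async(square_plus_one, args=(i,)).get() for i in range(10)]

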
def test3():
    dic = {'description': 0, 'id': 1, 'manufacturer': 0, 'name': 0.9309734582901001, 'price': 0.912541675567627}
    ll = list(dic.values())
    ten = torch.Tensor(ll)
    # Expand the five per-attribute weights to shape (5, 742, 742) by repeating.
    t = ten.unsqueeze(1)
    t = t.unsqueeze(2)
    y = t.repeat(1, 742, 742)
    print(ten)
    print(y)
    print(torch.isfinite(ten))
    print(torch.count_nonzero(y).item())


def test4():
    # All 27 entries are True, so count_nonzero prints 27.
    one_bool_tensor = torch.ones((3, 3, 3), dtype=torch.bool)
    print(torch.count_nonzero(one_bool_tensor).item())


def test5():
    # Row-wise dot products: elementwise multiply, then sum along dim 1.
    ten1 = torch.tensor([[1, 2, 3],
                         [7, 8, 9]])
    ten2 = torch.tensor([[4, 5, 6],
                         [11, 12, 15]])
    result = ten1 * ten2
    r = torch.sum(result, 1)
    print('\n')
    print(result)
    print(r)
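    # Equivalent one-liner (illustrative): the same row-wise dot products via einsum.
    # assert torch.equal(r, torch.einsum('ij,ij->i', ten1, ten2))

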
def test6():
    # normalize(dim=1) scales each column of a slice to unit norm;
    # normalize(dim=2) scales each row.
    table_tensor = torch.tensor([[[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.]],
                                 [[1., 2., 3.],
                                  [4., 5., 6.],
                                  [7., 8., 9.]]])
    t = torch.tensor([[1., 2., 3.],
                      [4., 5., 6.]])
    norm1 = torch.nn.functional.normalize(table_tensor, dim=1)
    norm2 = torch.nn.functional.normalize(table_tensor, dim=2)
    print('\n')
    print(norm1)
    print(norm2)
    print(t.shape)


def test7():
    # Derive the next run index from existing eval_result files; note that the
    # slice [12:13] only handles a single-digit index.
    iterations = 1
    filename_list = os.listdir(er_output_dir)
    if len(filename_list) > 0:
        for _ in filename_list:
            if _.startswith('eval_result'):
                iterations = int(_[12:13]) + 1
    print(iterations)


def test8():
    # Round-trip a ConfigSpace definition: JSON file -> dict -> string -> ConfigurationSpace.
    with open(hpo_output_dir + "configspace.json", 'r') as load_f:
        dict_configspace = json.load(load_f)
    str_configspace = json.dumps(dict_configspace)
    configspace = csj.read(str_configspace)
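    # Inverse direction (illustrative): csj.write(configspace) yields the JSON
    # string this function parses back, so write/read round-trips the space.

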
def test9():
    df = pd.read_json(r'./datasets/t.json', encoding='ISO-8859-1', lines=True)
    df.to_csv(r'./datasets/s.csv')
    d = pd.read_csv(r'./datasets/s.csv', encoding='ISO-8859-1')
    print(1)


def test10():
    rtable = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\ml_er\output\predictions.csv',
                         encoding='ISO-8859-1')
    print(1)
    rtable.columns = ["id", "title", "authors", "venue", "year"]
    rtable.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\DBLP-GoogleScholar\tableB.csv',
                  sep=',', index=False, header=True, quoting=1)


def test11():
    values = {
        'block_attr': 'class',
        'confidence_thresh': 0.2717823249253852,
        'ml_blocker': 'attr_equiv',
        'ml_matcher': 'ln',
        'similarity_thresh': 0.20681820299103484,
        'support_thresh': 129,
    }
    with open(hpo_output_dir + "incumbent.json", "w") as f:
        json.dump(values, f, indent=4)


def test12():
    with open(hpo_output_dir + "incumbent.json", 'r') as f:
        dic = json.load(f)
    for _ in dic.keys():
        print(f'Key:{_}\tValue:{dic[_]}\tType:{type(dic[_])}')


def test13():
    outcome_dir = r'E:\Data\Research\Outcome'
    configs_dir = r'\Magellan+Smac+roberta-large-nli-stsb-mean-tokens'
    datasets_list = os.listdir(outcome_dir)
    f = []
    for _ in datasets_list:
        f.append(outcome_dir + rf'\{_}' + configs_dir)
    print(f)