parent 24985da169
commit ea3b247da0
@@ -0,0 +1,27 @@
+import csv
+
+import pandas as pd
+import json
+import sentence_transformers.util
+import torch
+from sentence_transformers import SentenceTransformer
+from torch import nn
+
+if __name__ == '__main__':
+    # model = SentenceTransformer('E:\\Data\\Research\\Models\\roberta-large-nli-stsb-mean-tokens')
+    # sentences = ['公积金转入深圳', '公积金转出深圳', None, None, 114514, 114514, 1919810]  # zh: 'transfer housing fund into Shenzhen' / 'transfer housing fund out of Shenzhen'
+    # embedding = model.encode(sentences, device='cuda')
+    # outcome1 = sentence_transformers.util.cos_sim(embedding[4], embedding[5])
+    # outcome2 = sentence_transformers.util.cos_sim(embedding[4], embedding[6])
+    # print(outcome1.item())
+    # print(outcome2.item())
+    train = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\train.csv', encoding='ISO-8859-1')
+    valid = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\valid.csv', encoding='ISO-8859-1')
+    test = pd.read_csv(r'E:\Data\Research\Datasets\DeepMatcher dataset\Dirty\Walmart-Amazon\test.csv', encoding='ISO-8859-1')
+    train = train[train['label'] == 1]
+    valid = valid[valid['label'] == 1]
+    test = test[test['label'] == 1]
+    matches = pd.concat([train, valid, test])
+    matches.drop(columns=['label'], inplace=True)
+    matches = matches.sort_values(by='ltable_id')
+    matches.to_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv', sep=',', index=False, header=True)
File diff suppressed because it is too large
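Note (not part of the commit): a minimal sanity check for the generated matches.csv, sketched under the assumption that the output keeps the ltable_id column referenced in the script and that the file path above exists on the target machine.

    import pandas as pd

    # Load the file produced by the commit's script.
    matches = pd.read_csv(r'E:\Data\Research\Projects\matching_dependency\datasets\Walmart-Amazon_dirty\matches.csv')
    # drop(columns=['label']) should have removed the label column.
    assert 'label' not in matches.columns
    # sort_values(by='ltable_id') should leave the rows in ascending order.
    assert matches['ltable_id'].is_monotonic_increasing
    print(len(matches), 'positive (label == 1) pairs across train/valid/test')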