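"""Compute BERT sentence embeddings for each subject and save them as columns.

Reads the subjects table, concatenates subject name, sub-discipline name and
tag names into one string per row, cleans it, encodes it with the pretrained
bert-base-chinese model, and writes a table with 768 `bert_em*` columns to
`subjects_bert_em_path`.
"""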
import os
import sys
import logging

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModel

sys.path.append(os.getcwd())

from config import subjects_data_path, subjects_bert_em_path, bert_base_chinese
from utils import finalcut

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
tqdm.pandas()

# Load the large-scale pretrained BERT model
bert_model = bert_base_chinese
tokenizer = AutoTokenizer.from_pretrained(bert_model)
model = TFAutoModel.from_pretrained(bert_model, output_hidden_states=True)  # also return all hidden states
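# Note: only the pooler output (outputs[1]) is consumed below, so
# output_hidden_states=True is not strictly needed for this script.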

subject = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')

subject = subject.drop(['disciplines_id', 'disciplines_name', 'sub_discipline_id', 'status',
                        'updated_at', 'publish_time', 'homepage_show', 'repertoire_id', 'score_count',
                        'initiative_study', 'course_used_count', 'school_used_count', 'initiative_school_used_count',
                        'initiative_passed_count', 'initiative_challenge_count', 'initiative_evaluate_count',
                        'video_study_time', 'initiative_video_study_time', 'initiative_study_pdf_attachment_count',
                        'created_at_ts'], axis=1)

# Add the 768 bert_em columns (hidden size of bert-base-chinese)
for i in tqdm(range(768)):
    bert_em = "bert_em" + str(i)
    subject[bert_em] = 0.

# Prepare the text data: concatenate name, sub-discipline and tags per row
subject_name = subject["subject_name"].fillna("")
sub_dis_name = subject["sub_discipline_name"].fillna("")
tags_name = subject["tag_names"].fillna("")

subject_text = subject_name + sub_dis_name + tags_name

words = []
for i in tqdm(range(len(subject_text))):
    words.append(finalcut(subject_text[i]))  # strip invalid characters from the sentence

def getbert_vec(word_list):
    # Tokenize the input sentence
    inputs = tokenizer(word_list, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
    outputs = model(inputs)
    hidden_states = outputs[1]  # pooler output: the sentence vector
    return list(hidden_states.numpy()[0])
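
# getbert_vec runs one model call per sentence, which is slow on large tables.
# A batched variant is sketched below; the function name and batch_size are
# illustrative assumptions, not part of the original pipeline.
def getbert_vec_batched(sentences, batch_size=32):
    vectors = []
    for start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="tf", padding="max_length",
                           truncation=True, max_length=64)
        outputs = model(inputs)
        vectors.extend(outputs[1].numpy())  # one 768-dim pooler vector per sentence
    return vectors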

words_list = []
for i in tqdm(range(len(words))):
    words_list.append(getbert_vec(words[i]))  # BERT embedding vector for each sentence

for i in tqdm(range(len(words_list))):
    for j in range(len(words_list[i])):
        column = "bert_em" + str(j)
        subject.loc[i, column] = words_list[i][j]
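
# The element-wise .loc writes above issue rows x 768 scalar assignments; an
# equivalent vectorized form (a sketch, assuming every row of words_list holds
# all 768 values) would be:
#   em_cols = ["bert_em" + str(j) for j in range(768)]
#   subject[em_cols] = pd.DataFrame(words_list, columns=em_cols)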

subject = subject.drop(["subject_name", "sub_discipline_name", "tag_names"], axis=1)
subject.to_csv(subjects_bert_em_path, sep='\t', index=False, header=True)