You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

68 lines
2.5 KiB

from tqdm import tqdm
from ltp import LTP
from gensim.models import KeyedVectors
import os
import sys
sys.path.append(os.getcwd())
from config import subjects_data_path,subjects_bert_em_path
from utils import finalcut
import pandas as pd
import re
import logging
from transformers import AutoTokenizer, TFAutoModel
from config import bert_base_chinese
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
tqdm.pandas()
# 加载大规模预训练Bert模型
bert_model = bert_base_chinese
tokenizer = AutoTokenizer.from_pretrained(bert_model)
model = TFAutoModel.from_pretrained(bert_model,output_hidden_states=True) # 模型是否返回所有隐藏状态。
subject = pd.read_csv(subjects_data_path,sep='\t',encoding='utf-8')
subject = subject.drop(['disciplines_id', 'disciplines_name', 'sub_discipline_id', 'status',
'updated_at', 'publish_time', 'homepage_show','repertoire_id', 'score_count',
'initiative_study', 'course_used_count','school_used_count','initiative_school_used_count',
'initiative_passed_count','initiative_challenge_count','initiative_evaluate_count',
'video_study_time','initiative_video_study_time','initiative_study_pdf_attachment_count','created_at_ts'], axis=1)
#添加bert_em列
for i in tqdm(range(768)):
bert_em = "bert_em" + str(i)
subject[bert_em] = 0.
# 准备数据
subject_name = subject["subject_name"]
sub_dis_name = subject["sub_discipline_name"]
tags_name = subject["tag_names"]
subject_name.fillna(value="",inplace=True)
sub_dis_name.fillna(value="",inplace=True)
tags_name.fillna(value="",inplace=True)
subject_text = subject_name+sub_dis_name+tags_name
words = []
for i in tqdm(range(len(subject_text))):
words.append(finalcut((subject_text[i]))) #删除句子中无效字符
def getbert_vec(word_list):
# 输入测试句子
inputs = tokenizer(word_list, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
outputs = model(inputs)
hidden_states = outputs[1] # 获得句子向量
return list(hidden_states.numpy()[0])
words_list = []
for i in tqdm(range(len(words))):
words_list.append(getbert_vec(words[i]))#获得句子的bert embedding向量
for i in tqdm(range(len(words_list))):
for j in range(len(words_list[i])):
column = "bert_em" + str(j)
subject.loc[i, column] = words_list[i][j]
subject = subject.drop(["subject_name","sub_discipline_name","tag_names"], axis=1)
subject.to_csv(subjects_bert_em_path,sep='\t', index=False, header=True)