EduCoder_Study_RS/ranking/subject/bert_embedding.py

from tqdm import tqdm
from ltp import LTP
from gensim.models import KeyedVectors
import os
import sys
sys.path.append(os.getcwd())
from config import subjects_data_path,subjects_bert_em_path
from utils import finalcut
import pandas as pd
import re
import logging
from transformers import AutoTokenizer, TFAutoModel
from config import bert_base_chinese


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Register the tqdm progress hooks on pandas objects.
tqdm.pandas()

# Load the large-scale pretrained BERT model named by the config value.
bert_model = bert_base_chinese
tokenizer = AutoTokenizer.from_pretrained(bert_model)
# output_hidden_states=True asks the model to also return all hidden states.
model = TFAutoModel.from_pretrained(
    bert_model,
    output_hidden_states=True,
)

# Load the subject table (tab-separated, UTF-8).
subject = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')

# Drop the columns that are not carried into the embedding output.
subject = subject.drop(['disciplines_id', 'disciplines_name', 'sub_discipline_id', 'status',
                       'updated_at', 'publish_time', 'homepage_show','repertoire_id', 'score_count',
                       'initiative_study', 'course_used_count','school_used_count','initiative_school_used_count',
                       'initiative_passed_count','initiative_challenge_count','initiative_evaluate_count',
                       'video_study_time','initiative_video_study_time','initiative_study_pdf_attachment_count','created_at_ts'], axis=1)

# Add the placeholder columns bert_em0 .. bert_em767, initialised to 0.0.
# They are appended with a single concat: inserting 768 columns one at a time
# fragments the DataFrame and makes pandas emit a PerformanceWarning.
bert_em_columns = ["bert_em" + str(i) for i in range(768)]
subject = pd.concat(
    [subject, pd.DataFrame(0., index=subject.index, columns=bert_em_columns)],
    axis=1,
)

# Prepare the text: missing values become empty strings so the three fields
# can be concatenated. fillna() returns new Series, so `subject` itself is
# not modified here.
subject_name = subject["subject_name"].fillna("")
sub_dis_name = subject["sub_discipline_name"].fillna("")
tags_name = subject["tag_names"].fillna("")

subject_text = subject_name + sub_dis_name + tags_name

# Strip invalid characters from each sentence (per the original comment on
# finalcut); one entry per row, in row order.
words = [finalcut(text) for text in tqdm(subject_text)]

def getbert_vec(word_list):
    """Return the BERT sentence vector for one piece of text.

    Args:
        word_list: Text handed to the module-level ``tokenizer``. In this
            script it is one element of ``words``, i.e. the output of
            ``finalcut``.

    Returns:
        A list holding the first row of ``outputs[1]`` as NumPy scalars.
    """
    # Tokenize the input, padding/truncating to exactly 64 tokens.
    encoded = tokenizer(
        word_list,
        return_tensors="tf",
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    model_out = model(encoded)
    # Element 1 of the model output is used as the sentence vector.
    # NOTE(review): presumably this is the pooled output rather than the
    # per-layer hidden states -- confirm against the loaded model class.
    sentence_vectors = model_out[1].numpy()
    first_row = sentence_vectors[0]
    return list(first_row)


# Compute the BERT embedding vector of every prepared sentence.
words_list = [getbert_vec(text) for text in tqdm(words)]

if words_list:
    # One column per vector component, named bert_em0, bert_em1, ...
    embedding_columns = ["bert_em" + str(j) for j in range(len(words_list[0]))]
    # Build the whole embedding matrix once and assign it in bulk. Writing
    # each component with its own `.loc[i, column] = value` costs one pandas
    # scalar assignment per cell (rows x vector length), which is very slow.
    # float64 matches the dtype of the 0.0-initialised placeholder columns,
    # so the values written to disk are the same as with per-cell assignment.
    # Rows are matched by position: words_list[k] belongs to the k-th row.
    embeddings = pd.DataFrame(
        words_list,
        index=subject.index,
        columns=embedding_columns,
        dtype="float64",
    )
    subject[embedding_columns] = embeddings

# The raw text columns are not part of the exported feature table.
subject = subject.drop(columns=["subject_name", "sub_discipline_name", "tag_names"])
subject.to_csv(subjects_bert_em_path, sep='\t', index=False, header=True)
first commit 5 months ago			`from tqdm import tqdm`
			`from ltp import LTP`
			`from gensim.models import KeyedVectors`
			`import os`
			`import sys`
			`sys.path.append(os.getcwd())`
			`from config import subjects_data_path,subjects_bert_em_path`
			`from utils import finalcut`
			`import pandas as pd`
			`import re`
			`import logging`
			`from transformers import AutoTokenizer, TFAutoModel`
			`from config import bert_base_chinese`


			`logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)`
			`tqdm.pandas()`

			`# 加载大规模预训练Bert模型`
			`bert_model = bert_base_chinese`
			`tokenizer = AutoTokenizer.from_pretrained(bert_model)`
			`model = TFAutoModel.from_pretrained(bert_model,output_hidden_states=True) # 模型是否返回所有隐藏状态。`

			`subject = pd.read_csv(subjects_data_path,sep='\t',encoding='utf-8')`

			`subject = subject.drop(['disciplines_id', 'disciplines_name', 'sub_discipline_id', 'status',`
			`'updated_at', 'publish_time', 'homepage_show','repertoire_id', 'score_count',`
			`'initiative_study', 'course_used_count','school_used_count','initiative_school_used_count',`
			`'initiative_passed_count','initiative_challenge_count','initiative_evaluate_count',`
			`'video_study_time','initiative_video_study_time','initiative_study_pdf_attachment_count','created_at_ts'], axis=1)`
			`#添加bert_em列`
			`for i in tqdm(range(768)):`
			`bert_em = "bert_em" + str(i)`
			`subject[bert_em] = 0.`

			`# 准备数据`
			`subject_name = subject["subject_name"]`
			`sub_dis_name = subject["sub_discipline_name"]`
			`tags_name = subject["tag_names"]`

			`subject_name.fillna(value="",inplace=True)`
			`sub_dis_name.fillna(value="",inplace=True)`
			`tags_name.fillna(value="",inplace=True)`

			`subject_text = subject_name+sub_dis_name+tags_name`

			`words = []`
			`for i in tqdm(range(len(subject_text))):`
			`words.append(finalcut((subject_text[i]))) #删除句子中无效字符`

			`def getbert_vec(word_list):`
			`# 输入测试句子`
			`inputs = tokenizer(word_list, return_tensors="tf", padding="max_length", truncation=True, max_length=64)`
			`outputs = model(inputs)`
			`hidden_states = outputs[1] # 获得句子向量`
			`return list(hidden_states.numpy()[0])`


			`words_list = []`
			`for i in tqdm(range(len(words))):`
			`words_list.append(getbert_vec(words[i]))#获得句子的bert embedding向量`

			`for i in tqdm(range(len(words_list))):`
			`for j in range(len(words_list[i])):`
			`column = "bert_em" + str(j)`
			`subject.loc[i, column] = words_list[i][j]`
			`subject = subject.drop(["subject_name","sub_discipline_name","tag_names"], axis=1)`
			`subject.to_csv(subjects_bert_em_path,sep='\t', index=False, header=True)`