import os
import sys
sys.path.append(os.getcwd())

import csv

import jieba
import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors
from ltp import LTP
from tqdm import tqdm

from config import subjects_data_path, logger, user_dict_path
from config import subjects_keywords_path
from config import JIEBA_TOKEN, LTP_TOKEN, word2vec_dim
from config import ltp_model_path
from config import word2vec_model_path, subject_faiss_w2v_path, data_parent_path

# Generate sentence vectors; the practical-course data contains the fields
# "subject_name", "sub_discipline_name" and "tag_names".
tqdm.pandas()

ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

logger.info("Loading Word2Vec word vectors")
w2v_model = KeyedVectors.load(word2vec_model_path)
fassi_w2v_model = KeyedVectors.load(subject_faiss_w2v_path)

# Load the user-defined dictionaries for both jieba and LTP
if os.path.exists(subjects_keywords_path):
    jieba.load_userdict(subjects_keywords_path)
    with open(subjects_keywords_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)

if os.path.exists(user_dict_path):
    with open(user_dict_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)
    for word in user_dict_words:
        jieba.add_word(word)


def tokenizer(sent, token_method=JIEBA_TOKEN, verbose=False):
    """
    Chinese word segmentation, supporting both jieba and LTP.
    """
    if token_method == JIEBA_TOKEN:
        result = ' '.join(jieba.cut(sent))
    elif token_method == LTP_TOKEN:
        seg = ltp.pipeline([sent], tasks=['cws'])['cws']
        result = ' '.join(seg[0])
    else:
        raise ValueError(f"Unsupported token_method: {token_method}")
    if verbose:
        logger.info(f"Tokenizer: {token_method}, result: {result}")
    return result


def sentence_embedding(sentence, w2v_model, fassi_w2v_model, verbose=False):
    """
    Generate a sentence vector by averaging word vectors.
    sentence: the sentence to embed
    w2v_model: the word2vec model
    return: the mean of all word vectors in the sentence, shape (1, word2vec_dim)
    """
    sentence = tokenizer(sentence, JIEBA_TOKEN, verbose)
    embedding = []
    for word in sentence.split():
        if word in fassi_w2v_model.wv.index_to_key:
            embedding.append(fassi_w2v_model.wv.get_vector(word))
        elif word in w2v_model.wv.index_to_key:
            embedding.append(w2v_model.wv.get_vector(word))
        else:
            # Out-of-vocabulary word: fall back to a random vector with the
            # same shape as a real word vector
            embedding.append(np.random.randn(word2vec_dim))
    # The sentence vector is the mean of all word vectors
    return np.mean(np.array(embedding), axis=0).reshape(1, -1)


def build_subjects_embedding():
    """
    Generate sentence vectors for all courses.
    """
    data = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')
    # Prepare the text fields, replacing missing values with empty strings
    subject_name = data["subject_name"].fillna("")
    sub_dis_name = data["sub_discipline_name"].fillna("")
    tags_name = data["tag_names"].fillna("")
    subject_text = subject_name + sub_dis_name + tags_name
    logger.info('Generating vectors for all courses')
    data['subject_name_vec'] = subject_text.progress_apply(
        lambda x: sentence_embedding(x, w2v_model, fassi_w2v_model))
    logger.info('Checking the dimension of every course vector')
    # Flag vectors with an unexpected dimension (e.g. produced from empty text)
    data['subject_name_vec'] = data['subject_name_vec'].progress_apply(
        lambda x: x[0][0] if x.shape[1] != word2vec_dim else x)
    data['subject_id'] = data['subject_id'].astype(int)
    save_embedding_data(data)


def save_embedding_data(datas):
    """
    Save the generated embeddings to a tab-separated CSV file.
    """
    logger.info("Saving the generated course vectors")
    with open(data_parent_path + 'subjects_emb.csv', 'w',
              encoding='utf-8', newline="") as subjects_emb_data:
        csv_out = csv.writer(subjects_emb_data, delimiter='\t')
        # Write the header row first: subject_id followed by one column per
        # embedding dimension
        headers = ['subject_id'] + [f"emb_{i}" for i in range(word2vec_dim)]
        csv_out.writerow(headers)
        # Then write one row per course
        for index, row in datas.iterrows():
            # Each stored vector has shape (1, word2vec_dim); take its only row
            subject_name_vecs = row['subject_name_vec'][0]
            row_data = [row['subject_id']] + list(subject_name_vecs)
            csv_out.writerow(row_data)


if __name__ == '__main__':
    build_subjects_embedding()
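# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of this module's interface):
# assuming build_subjects_embedding() has already written subjects_emb.csv,
# a query text could be embedded with the same sentence_embedding() function
# and compared against the stored course vectors, for example by cosine
# similarity. The file name and column layout mirror what
# save_embedding_data() writes; the query string below is made up.
#
#   emb_df = pd.read_csv(data_parent_path + 'subjects_emb.csv',
#                        sep='\t', encoding='utf-8')
#   emb_matrix = emb_df[[f"emb_{i}" for i in range(word2vec_dim)]].to_numpy()
#   query_vec = sentence_embedding("机器学习 数据分析",
#                                  w2v_model, fassi_w2v_model).reshape(-1)
#   scores = (emb_matrix @ query_vec) / (
#       np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_vec) + 1e-8)
#   top_subject_ids = emb_df['subject_id'].to_numpy()[np.argsort(-scores)[:10]]
# ---------------------------------------------------------------------------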