"""Build sentence embeddings for every shixun (practice project) and save them.

Each shixun's name, language and subject name are concatenated, tokenized with
jieba, and turned into a sentence vector by averaging Word2Vec word vectors.
The resulting vectors are written to ``shixuns_emb.csv`` (tab separated).
"""
import csv
import os
import sys

import pandas as pd
import numpy as np
from tqdm import tqdm
from ltp import LTP
import jieba
from gensim.models import KeyedVectors
import torch

# The project-level ``config`` module is resolved relative to the working
# directory, so the path must be extended before importing it.
sys.path.append(os.getcwd())
from config import shixuns_data_path, logger, user_dict_path
from config import ltp_model_path, shixuns_keywords_path
from config import JIEBA_TOKEN, LTP_TOKEN, word2vec_dim
from config import word2vec_model_path, shixun_faiss_w2v_path, data_parent_path

# Enable ``Series.progress_apply``.
tqdm.pandas()

ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

logger.info("加载Word2Vec词向量")
w2v_model = KeyedVectors.load(word2vec_model_path)
fassi_w2v_model = KeyedVectors.load(shixun_faiss_w2v_path)

# Load the user-defined dictionaries into both tokenizers.
if os.path.exists(shixuns_keywords_path):
    jieba.load_userdict(shixuns_keywords_path)
    with open(shixuns_keywords_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
        ltp.add_words(user_dict_words)

if os.path.exists(user_dict_path):
    with open(user_dict_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
        ltp.add_words(user_dict_words)
        for word in user_dict_words:
            jieba.add_word(word)


def tokenizer(sent, token_method=JIEBA_TOKEN, verbose=False):
    """Segment a Chinese sentence into a space-separated token string.

    Args:
        sent: The sentence to segment.
        token_method: ``JIEBA_TOKEN`` (default) or ``LTP_TOKEN``.
        verbose: When truthy, log the method used and the segmentation result.

    Returns:
        The tokens of ``sent`` joined by single spaces.

    Raises:
        ValueError: If ``token_method`` is neither ``JIEBA_TOKEN`` nor
            ``LTP_TOKEN``.
    """
    if token_method == JIEBA_TOKEN:
        words = jieba.cut(sent)
    elif token_method == LTP_TOKEN:
        # The pipeline works on a batch; send one sentence, take its result.
        words = ltp.pipeline([sent], tasks=['cws'])['cws'][0]
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unsupported token_method: {token_method!r}")

    result = ' '.join(words)
    if verbose:
        logger.info(f"分词方式:{token_method}, 分词结果:{result}")
    return result


def sentence_embedding(sentence, w2v_model, fassi_w2v_model, verbose=False):
    """Build a sentence vector as the mean of its word vectors.

    The sentence is tokenized with jieba (punctuation is not stripped first).
    For each token the vector is taken from ``fassi_w2v_model`` when the token
    is in its vocabulary, otherwise from ``w2v_model``; tokens unknown to both
    models get a random standard-normal vector of length ``word2vec_dim``.

    Args:
        sentence: The sentence to embed.
        w2v_model: General Word2Vec model (must expose ``.wv``).
        fassi_w2v_model: Word2Vec model that takes precedence over
            ``w2v_model`` (must expose ``.wv``).
        verbose: Forwarded to ``tokenizer`` to log the segmentation.

    Returns:
        A 2-D array with a single row holding the averaged vector. A sentence
        that yields no tokens produces a ``(1, word2vec_dim)`` zero vector.
    """
    tokens = tokenizer(sentence, JIEBA_TOKEN, verbose).split()
    if not tokens:
        # Averaging an empty list would give NaN with the wrong shape.
        return np.zeros((1, word2vec_dim))

    # Dict lookups are O(1); ``index_to_key`` is a list and would be scanned
    # linearly for every token.
    preferred_vocab = fassi_w2v_model.wv.key_to_index
    general_vocab = w2v_model.wv.key_to_index

    vectors = []
    for word in tokens:
        if word in preferred_vocab:
            vectors.append(fassi_w2v_model.wv.get_vector(word))
        elif word in general_vocab:
            vectors.append(w2v_model.wv.get_vector(word))
        else:
            # 1-D like the model vectors, so the stacked array is rectangular
            # even when known and unknown words are mixed.
            vectors.append(np.random.randn(word2vec_dim))

    # The sentence vector is the mean of all word vectors.
    return np.mean(np.array(vectors), axis=0).reshape(1, -1)


def save_embedding_data(datas):
    """Write the shixun embeddings to ``shixuns_emb.csv`` (tab separated).

    Args:
        datas: DataFrame with a ``shixun_id`` column and a ``shixun_name_vec``
            column whose values are indexable with ``[0]`` to obtain the
            vector components (as produced by ``sentence_embedding``).
    """
    logger.info("保存生成的实训向量")

    # One header per vector component, following the configured dimension.
    headers = ['shixun_id'] + [f"emb_{i}" for i in range(word2vec_dim)]

    # The context manager guarantees the file is flushed and closed.
    with open(data_parent_path + 'shixuns_emb.csv', 'w',
              encoding='utf-8', newline="") as emb_file:
        csv_out = csv.writer(emb_file, delimiter='\t')
        csv_out.writerow(headers)
        for _, row in datas.iterrows():
            # The stored vector has a single row; write its components.
            csv_out.writerow([row['shixun_id'], *row['shixun_name_vec'][0]])


def build_shixuns_embedding():
    """Generate and save sentence vectors for all shixuns.

    Reads the tab-separated file at ``shixuns_data_path``, embeds the
    concatenation of ``shixun_name``, ``language`` and ``subject_name`` for
    every row, and saves the result via ``save_embedding_data``.
    """
    data = pd.read_csv(shixuns_data_path, sep='\t', encoding='utf-8')

    # Fill missing values first: concatenating with NaN would make the whole
    # text NaN.
    shixun_text = (
        data["shixun_name"].fillna("")
        + data["language"].fillna("")
        + data["subject_name"].fillna("")
    )

    logger.info('生成所有实训向量')
    data['shixun_name_vec'] = shixun_text.progress_apply(
        lambda x: sentence_embedding(x, w2v_model, fassi_w2v_model))

    logger.info('检测所有实训向量的维度')
    # NOTE(review): when the width differs from word2vec_dim this replaces the
    # vector with its first element; the intent is unclear - confirm.
    data['shixun_name_vec'] = data['shixun_name_vec'].progress_apply(
        lambda x: x[0][0] if x.shape[1] != word2vec_dim else x)

    data['shixun_id'] = data['shixun_id'].astype(int)

    save_embedding_data(data)


if __name__ == '__main__':
    build_shixuns_embedding()