import os
import sys

sys.path.append(os.getcwd())

import multiprocessing
from time import time

import pandas as pd
import torch
import jieba
from tqdm import tqdm
from gensim.models import Word2Vec
from ltp import LTP

from config import shixuns_keywords_path, ltp_model_path
from config import JIEBA_TOKEN, LTP_TOKEN, user_dict_path, logger
from config import shixuns_data_path, shixun_faiss_w2v_path
from config import word2vec_dim

# Train the word2vec embeddings used for recall. Each shixun record
# contains the fields: "shixun_name", "language", "subject_name".

# Enable progress bars for pandas progress_apply
tqdm.pandas()

# Load the LTP model and move it to the GPU when one is available
ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

# Load the user-defined dictionaries into both tokenizers
if os.path.exists(shixuns_keywords_path):
    jieba.load_userdict(shixuns_keywords_path)
    with open(shixuns_keywords_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)

if os.path.exists(user_dict_path):
    with open(user_dict_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)
    for word in user_dict_words:
        jieba.add_word(word)
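
# Note: both dictionary files are treated as plain whitespace-separated word
# lists here (they are split with f.read().split() before being passed to
# ltp.add_words), so no frequency or POS columns are expected.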


def tokenizer(sent, token_method=JIEBA_TOKEN):
    """
    Chinese word segmentation supporting jieba and LTP, returning the
    tokens joined by single spaces.
    """
    if token_method == JIEBA_TOKEN:
        result = ' '.join(jieba.cut(sent))
    elif token_method == LTP_TOKEN:
        # LTP's pipeline takes a batch of sentences; 'cws' is word segmentation
        seg = ltp.pipeline([sent], tasks=['cws'])['cws']
        result = ' '.join(seg[0])
    else:
        raise ValueError('unsupported token_method: {}'.format(token_method))
    return result
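
# A quick sanity check (hypothetical input; the exact segmentation depends on
# the dictionaries loaded above):
#   tokenizer('Python程序设计基础')  ->  e.g. 'Python 程序设计 基础'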


def read_data(file_path):
    """
    Read the training data and tokenize it.
    """
    logger.info("Loading train data")
    train = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    logger.info("Starting tokenize...")

    # Fill missing values before concatenation: if any of the three columns
    # is NaN, the concatenated text would otherwise become NaN as a whole
    shixun_name = train["shixun_name"].fillna("")
    language = train["language"].fillna("")
    subject_name = train["subject_name"].fillna("")

    # Join the three fields with spaces so tokens from different fields
    # do not run together before segmentation
    shixun_text = shixun_name + ' ' + language + ' ' + subject_name

    train['token_content'] = shixun_text.progress_apply(tokenizer)
    return train
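
# After read_data, train['token_content'] holds space-separated tokens per row,
# e.g. (hypothetical record): 'Python 程序设计 基础 Python 人工智能' for a shixun
# named 'Python程序设计基础' with language 'Python' and subject '人工智能'.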


def train_w2v(train, to_file):
    """
    Train a word2vec model on the tokenized corpus and save it to to_file.
    """
    # All sentences, one token list per row
    sentences = [row.split() for row in train['token_content']]

    # Number of available CPU cores
    cores = multiprocessing.cpu_count()

    w2v_model = Word2Vec(min_count=1,  # keep min_count at 1 so rare domain terms do not become OOV
                         window=5,
                         vector_size=word2vec_dim,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=15,
                         workers=cores // 2,
                         epochs=20,
                         hs=1)

    t = time()
    w2v_model.build_vocab(sentences)
    logger.info('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

    t = time()
    # The epochs passed here override the value given to the constructor
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    logger.info('Time to train word2vec: {} mins'.format(round((time() - t) / 60, 2)))

    # Use makedirs so nested output directories are created as needed
    if not os.path.exists(os.path.dirname(to_file)):
        os.makedirs(os.path.dirname(to_file))

    w2v_model.save(to_file)

    logger.info('train word2vec finished.')
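
# A minimal sketch of loading the saved model downstream (assuming the same
# gensim version; 'Python' is a hypothetical query token):
#   from gensim.models import Word2Vec
#   w2v_model = Word2Vec.load(shixun_faiss_w2v_path)
#   w2v_model.wv.most_similar('Python', topn=10)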


if __name__ == "__main__":
    train = read_data(shixuns_data_path)
    train_w2v(train, shixun_faiss_w2v_path)