import os
import sys
sys.path.append(os.getcwd())
import multiprocessing
from time import time
import pandas as pd
from gensim.models import Word2Vec
from config import shixuns_keywords_path, ltp_model_path
from tqdm import tqdm
import jieba
from ltp import LTP
import torch
from config import JIEBA_TOKEN, LTP_TOKEN, user_dict_path, logger
from config import shixuns_data_path, shixun_faiss_w2v_path
from config import word2vec_dim
# Train the word2vec embeddings used for recall; each shixun record combines the fields "shixun_name", "language", "subject_name"
tqdm.pandas()
ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")
# Load the user-defined dictionaries so domain terms are kept as single tokens
if os.path.exists(shixuns_keywords_path):
    jieba.load_userdict(shixuns_keywords_path)
    with open(shixuns_keywords_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)

if os.path.exists(user_dict_path):
    with open(user_dict_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)
    for word in user_dict_words:
        jieba.add_word(word)

def tokenizer(sent, token_method=JIEBA_TOKEN):
    """
    Chinese word segmentation; supports both jieba and LTP
    """
    if token_method == JIEBA_TOKEN:
        seg = jieba.cut(sent)
        result = ' '.join(seg)
    elif token_method == LTP_TOKEN:
        # LTP expects a list of sentences; take the segmentation of the first (and only) one
        seg = ltp.pipeline([sent], tasks=['cws'])['cws']
        result = ' '.join(seg[0])
    return result
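
# Illustrative usage (hypothetical input; the exact segmentation depends on the loaded dictionaries):
#   tokenizer('Python机器学习实训', token_method=JIEBA_TOKEN)  ->  e.g. 'Python 机器 学习 实训'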

def read_data(file_path):
    """
    Read the training data and tokenize it
    """
    logger.info("Loading train data")
    train = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    logger.info("Starting tokenize...")
    # Prepare the text fields
    shixun_name = train["shixun_name"]
    language = train["language"]
    subject_name = train["subject_name"]

    # Fill missing values; otherwise a NaN in any single field makes the whole concatenation NaN
    shixun_name = shixun_name.fillna(value="")
    language = language.fillna(value="")
    subject_name = subject_name.fillna(value="")

    shixun_text = shixun_name + language + subject_name
    train['token_content'] = shixun_text.progress_apply(tokenizer)
    return train
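
# Note: the TSV behind shixuns_data_path is assumed (not checked here) to contain at least the
# columns shixun_name, language and subject_name; an illustrative row could look like
#   "Python入门实训\tPython\t程序设计基础"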

def train_w2v(train, to_file):
    """
    Train the word2vec model on the tokenized text and save it
    """
    # All tokenized sentences, split back into lists of tokens
    sentences = [row.split() for row in train['token_content']]

    # Number of CPU cores
    cores = multiprocessing.cpu_count()

    w2v_model = Word2Vec(min_count=1,  # min_count=1 keeps rare domain terms from becoming OOV
                         window=5,
                         vector_size=word2vec_dim,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=15,
                         workers=cores // 2,
                         epochs=20,
                         hs=1)

    t = time()
    w2v_model.build_vocab(sentences)
    logger.info('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

    t = time()
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    logger.info('Time to train word2vec: {} mins'.format(round((time() - t) / 60, 2)))

    if not os.path.exists(os.path.dirname(to_file)):
        os.makedirs(os.path.dirname(to_file))
    w2v_model.save(to_file)
    logger.info('train word2vec finished.')

if __name__ == "__main__":
    train = read_data(shixuns_data_path)
    train_w2v(train, shixun_faiss_w2v_path)
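
# Minimal follow-up sketch (assumption: run after training has finished and the queried token
# appears in the vocabulary) showing how the saved model could be loaded and queried when
# building the Faiss recall index:
#
#   w2v_model = Word2Vec.load(shixun_faiss_w2v_path)
#   vector = w2v_model.wv['Python']                        # embedding for one token
#   similar = w2v_model.wv.most_similar('Python', topn=5)  # nearest neighbours in the vocab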