import os
import sys
sys.path.append(os.getcwd())
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import faiss
import config
from tqdm import tqdm
import jieba
from ltp import LTP
from config import data_path, word2vec_dim
from config import ltp_model_path, shixuns_keywords_path
from config import JIEBA_TOKEN, LTP_TOKEN, logger, user_dict_path
from datetime import datetime
import torch
# Build an HNSW fast-recall index with Faiss
tqdm.pandas()

ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

# Load the user-defined dictionaries into both tokenizers
if os.path.exists(shixuns_keywords_path):
    jieba.load_userdict(shixuns_keywords_path)
    with open(shixuns_keywords_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)

if os.path.exists(user_dict_path):
    with open(user_dict_path, 'r', encoding='utf-8') as f:
        user_dict_words = f.read().split()
    ltp.add_words(user_dict_words)
    for word in user_dict_words:
        jieba.add_word(word)
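# A minimal sketch (illustrative only) of what the user-dictionary loading above
# achieves: once a word is registered, jieba keeps it as a single token.
# The sample word and sentence are assumptions, not entries from the project files.
def _custom_dict_example(word='头歌实训'):
    """Illustrative only: register a custom word and check that jieba keeps it whole."""
    jieba.add_word(word)
    ltp.add_words([word])
    tokens = jieba.lcut(f'欢迎使用{word}平台')
    return word in tokens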
def tokenizer(sent, token_method=JIEBA_TOKEN, verbose=False):
    """
    Chinese word segmentation; supports both jieba and LTP
    """
    if token_method == JIEBA_TOKEN:
        seg = jieba.cut(sent)
        result = ' '.join(seg)
    elif token_method == LTP_TOKEN:
        # LTP expects a list of sentences; take the segmentation of the first one
        seg = ltp.pipeline([sent], tasks=['cws'])['cws']
        result = ' '.join(seg[0])
    if verbose:
        logger.info(f"Tokenization method: {token_method}, result: {result}")
    return result
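# A minimal usage sketch for tokenizer(); the sample sentence is an illustrative
# assumption, not taken from the project data.
def _tokenizer_example():
    """Illustrative only: compare jieba and LTP segmentation of one sentence."""
    sample = '基于Python的数据分析实训'
    jieba_result = tokenizer(sample, JIEBA_TOKEN, verbose=True)
    ltp_result = tokenizer(sample, LTP_TOKEN, verbose=True)
    # Both calls return a space-separated token string
    return jieba_result, ltp_result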
def sentence_embedding(sentence, w2v_model, fassi_w2v_model, verbose=False):
    '''
    Generate a sentence vector as the mean of its word vectors
    sentence: the sentence to embed
    w2v_model: the general word2vec model
    fassi_w2v_model: the self-trained word2vec model
    return: the mean of all word vectors in the sentence, shape (1, word2vec_dim)
    '''
    sentence = tokenizer(sentence, JIEBA_TOKEN, verbose)
    embedding = []
    for word in sentence.split():
        if word in fassi_w2v_model.wv.index_to_key:
            embedding.append(fassi_w2v_model.wv.get_vector(word))
        elif word in w2v_model.wv.index_to_key:
            embedding.append(w2v_model.wv.get_vector(word))
        else:
            # OOV in both models: fall back to a random vector
            # (1-D so that all entries stack to a consistent shape)
            embedding.append(np.random.randn(word2vec_dim))
    # The sentence vector is the mean of all word vectors
    return np.mean(np.array(embedding), axis=0).reshape(1, -1)
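# A minimal sketch of how sentence_embedding() is consumed downstream, assuming
# the two KeyedVectors models have already been loaded (as in HNSW.__init__ below).
def _sentence_embedding_example(w2v_model, fassi_w2v_model):
    """Illustrative only: embed one sentence and check the (1, word2vec_dim) shape."""
    vec = sentence_embedding('Python 机器学习', w2v_model, fassi_w2v_model)
    assert vec.shape == (1, word2vec_dim)
    return vec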
class HNSW(object):
    def __init__(self,
                 w2v_path,                   # general-purpose word2vec vectors
                 faiss_w2v_path,             # self-trained word2vec vectors
                 ef=config.ef_construction,  # size of the dynamic neighbor list used while building the graph
                 M=config.M,                 # number of neighbors per node
                 model_path=None,            # path for saving/loading the hnsw index
                 data_path=None):            # path of the data file
        # Load the word vectors
        logger.info("Loading Word2Vec vectors")
        self.w2v_model = KeyedVectors.load(w2v_path)
        self.fassi_w2v_model = KeyedVectors.load(faiss_w2v_path)

        if model_path and os.path.exists(model_path):
            # Load an existing hnsw index
            logger.info("Loading the HNSW fast-recall model")
            self.data = pd.read_csv(data_path, sep='\t', encoding='utf-8')
            self.index = self.load_hnsw(model_path)
        elif data_path:
            # Build the hnsw index from the raw data
            logger.info("Building the HNSW fast-recall model")
            self.data = self.load_data(data_path)
            self.index = self.build_hnsw(model_path, ef=ef, m=M)
        else:
            logger.error('No existing model and no building data provided.')
    def load_data(self, data_path):
        '''
        Read the data and generate sentence vectors
        :param data_path: path of the data file
        :return: dataframe with a sentence-vector column
        '''
        data = pd.read_csv(data_path, sep='\t', encoding='utf-8')

        logger.info('Generating vectors for all shixun names')
        data['shixun_name_vec'] = data['shixun_name'].progress_apply(
            lambda x: sentence_embedding(x, self.w2v_model, self.fassi_w2v_model))

        logger.info('Checking the dimension of all shixun vectors')
        data['shixun_name_vec'] = data['shixun_name_vec'].progress_apply(
            lambda x: x[0][0] if x.shape[1] != word2vec_dim else x)

        # Make sure shixun_id is stored as an integer
        data['shixun_id'] = data['shixun_id'].astype(int)
        return data
    def evaluate(self, vecs):
        '''
        Evaluate the hnsw index, assuming the ground truth for query i is row i
        '''
        logger.info('Evaluating hnsw model')
        nq, d = vecs.shape
        t0 = time.time()
        # Retrieve the top-1 nearest neighbor for every query
        D, I = self.index.search(vecs, 1)
        t1 = time.time()

        missing_rate = (I == -1).sum() / float(nq)
        recall_at_1 = (I[:, 0] == np.arange(nq)).sum() / float(nq)
        print("\t %7.3f ms per query, R@1 %.4f, missing rate %.4f" % (
            (t1 - t0) * 1000.0 / nq, recall_at_1, missing_rate))
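    # A minimal sketch of how evaluate() could be driven, reusing the indexed
    # vectors themselves as queries so that the ground truth for query i is
    # row i (the assumption the recall@1 computation above relies on).
    # The sample_size default is an illustrative assumption.
    def _evaluate_example(self, sample_size=1000):
        """Illustrative only: measure recall@1 on the first sample_size indexed vectors."""
        vecs = np.stack(self.data['shixun_name_vec'].values).reshape(-1, word2vec_dim)
        vecs = vecs[:sample_size].astype('float32')
        self.evaluate(vecs)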
    def build_hnsw(self, to_file, ef=2000, m=64):
        """
        Build the hnsw index
        """
        logger.info('Building the HNSW index')
        # Stack all sentence vectors into a single float32 matrix
        vecs = np.stack(self.data['shixun_name_vec'].values).reshape(-1, word2vec_dim)
        vecs = vecs.astype('float32')
        dim = self.w2v_model.vector_size

        # Build the index
        index = faiss.IndexHNSWFlat(dim, m)
        # Allocate single-GPU resources; note that the result of index_cpu_to_gpu
        # is not assigned, so the CPU IndexHNSWFlat is what gets filled and saved below
        res = faiss.StandardGpuResources()
        # res.setTempMemory(20 * 1024 * 1024 * 1024)
        faiss.index_cpu_to_gpu(res, 0, index)
        index.hnsw.ef_construction = ef
        index.verbose = True
        index.add(vecs)

        # Save the hnsw index
        faiss.write_index(index, to_file)
        return index
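    # A small sketch (not part of the original pipeline) of the query-time
    # speed/recall trade-off: Faiss exposes it through hnsw.efSearch, and the
    # default value of 64 used here is an illustrative assumption.
    def _set_ef_search(self, ef_search=64):
        """Illustrative only: raise efSearch for higher recall at higher query latency."""
        # Downcast to the concrete index class so the hnsw member is accessible
        # even when the index was loaded from disk
        index = faiss.downcast_index(self.index)
        index.hnsw.efSearch = ef_search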
    def load_hnsw(self, model_path):
        """
        Load a saved hnsw index from disk
        """
        hnsw = faiss.read_index(model_path)
        return hnsw
    def search(self, text, k=10):
        """
        Retrieve the top-k similar shixuns with hnsw
        """
        logger.info(f"Searching top {k} similarity for {text}.")
        # Convert the query text into a sentence vector
        test_vec = sentence_embedding(text, self.w2v_model, self.fassi_w2v_model, verbose=True)
        test_vec = test_vec.astype('float32')

        # Retrieve the most similar sentence vectors;
        # over-fetch (k * 2) so enough items remain after filtering by status
        D, I = self.index.search(test_vec, k * 2)
        top_k_index = list(I.ravel())

        top_k_Item = []
        for index in top_k_index:
            shixun_id = int(self.data.iloc[index]['shixun_id'])
            shixun_name = self.data.iloc[index]['shixun_name']
            shixun_status = self.data.iloc[index]['status']
            # Keep only published shixuns (status == 2)
            if shixun_status == 2:
                top_k_Item.append((shixun_id, shixun_name))

        # Return only the top k
        return top_k_index[:k], top_k_Item[:k]
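    # A minimal sketch of how search() results can be consumed; the query string
    # and k are illustrative assumptions.
    def _search_example(self, query='机器学习', k=5):
        """Illustrative only: run one query and log the published shixuns recalled."""
        top_k_index, top_k_item = self.search(query, k=k)
        for shixun_id, shixun_name in top_k_item:
            logger.info(f"recalled shixun {shixun_id}: {shixun_name}")
        return top_k_item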
def hnsw_search_test(hnsw):
    while True:
        topk = 20
        item_name = input('Please enter a shixun name: ').strip()
        top_k_index, top_k_Item = hnsw.search(item_name, k=topk)

        df_top_k = pd.DataFrame(columns=['index', 'shixun_recommend'])
        pd.set_option('colheader_justify', 'center')
        # top_k_Item may contain fewer than topk entries after status filtering
        for i in range(min(topk, len(top_k_Item))):
            df_top_k.loc[i] = [top_k_index[i], top_k_Item[i]]
        print(df_top_k)
if __name__ == '__main__':
    hnsw = HNSW(config.word2vec_model_path,
                config.shixun_faiss_w2v_path,
                config.ef_construction,
                config.M,
                config.shixuns_fassi_model_path,
                config.shixuns_data_path)
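    # Uncomment to run interactive queries against the index built/loaded above
    # (assumes the config paths point to valid model and data files):
    # hnsw_search_test(hnsw)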