import os import sys sys.path.append(os.getcwd()) import time import numpy as np import pandas as pd from gensim.models import KeyedVectors import faiss import config from tqdm import tqdm import jieba from ltp import LTP import torch from config import data_path, word2vec_dim from config import ltp_model_path, subjects_keywords_path from config import JIEBA_TOKEN, LTP_TOKEN, logger, user_dict_path # 使用Faiss训练hnsw模型 tqdm.pandas() ltp = LTP(ltp_model_path) if torch.cuda.is_available(): ltp.to("cuda") # 加载用户自定义词典 if os.path.exists(subjects_keywords_path): jieba.load_userdict(subjects_keywords_path) with open(subjects_keywords_path, 'r', encoding='utf-8') as f: user_dict_words = f.read().split() ltp.add_words(user_dict_words) if os.path.exists(user_dict_path): with open(user_dict_path, 'r', encoding='utf-8') as f: user_dict_words = f.read().split() ltp.add_words(user_dict_words) for word in user_dict_words: jieba.add_word(word) def tokenizer(sent, token_method=JIEBA_TOKEN, verbose=False): """ 中文分词,支持jieba和ltp两种方式 """ if token_method == JIEBA_TOKEN: seg = jieba.cut(sent) result = ' '.join(seg) elif token_method == LTP_TOKEN: content = [] content.append(sent) seg = ltp.pipeline(content, tasks=['cws'])['cws'] result = '' for word in seg[0]: if result == '': result = word else: result = result + ' ' + word if verbose == True: logger.info(f"分词方式:{token_method}, 分词结果:{result}") return result def sentence_embedding(sentence, w2v_model, fassi_w2v_model, verbose=False): ''' 通过词向量均值的方式生成句向量 sentence: 待生成句向量的句子 w2v_model: word2vec模型 return: 句子中所有词向量的均值 ''' # sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+", # "", sentence) sentence = tokenizer(sentence, JIEBA_TOKEN,verbose) embedding = [] for word in sentence.split(): if (word not in w2v_model.wv.index_to_key) and (word not in fassi_w2v_model.wv.index_to_key): embedding.append(np.random.randn(1, word2vec_dim)) else: if word in fassi_w2v_model.wv.index_to_key: embedding.append(fassi_w2v_model.wv.get_vector(word)) else: embedding.append(w2v_model.wv.get_vector(word)) # 所有词向量的均值为句向量 return np.mean(np.array(embedding), axis=0).reshape(1, -1) class HNSW(object): def __init__(self, w2v_path, # 通用word2vec词向量 faiss_w2v_path, # 自己训练的word2vec词向量 ef=config.ef_construction, # 搜索时保存最近邻的动态列表大小 M=config.M, # 节点的邻结点的数量 model_path=None, # hnsw模型保存路径 data_path=None): # 数据文件路径 # 加载词向量 logger.info("加载Word2Vec词向量") self.w2v_model = KeyedVectors.load(w2v_path) self.fassi_w2v_model = KeyedVectors.load(faiss_w2v_path) # 加载hnsw模型 if model_path and os.path.exists(model_path): logger.info("加载HNSW快速召回模型") self.data = pd.read_csv(data_path, sep='\t', encoding='utf-8') self.index = self.load_hnsw(model_path) # 训练hnsw模型 elif data_path: logger.info("训练HNSW快速召回模型") self.data = self.load_data(data_path) self.index = self.build_hnsw(model_path, ef=ef, m=M) else: logger.error('No existing model and no building data provided.') def load_data(self, data_path): ''' 读取数据,并生成句向量 :param data_path:数据所在路径 :return: 包含句向量的dataframe ''' data = pd.read_csv(data_path, sep='\t', encoding='utf-8') logger.info('生成所有课程向量') data['subject_name_vec'] = data['subject_name'].progress_apply( lambda x: sentence_embedding(x, self.w2v_model, self.fassi_w2v_model)) logger.info('检测所有课程向量的维度') data['subject_name_vec'] = data['subject_name_vec'].progress_apply( lambda x: x[0][0] if x.shape[1] != word2vec_dim else x) # 保存生成好的句向量 data['subject_id'] = data['subject_id'].astype(int) return data def evaluate(self, vecs): ''' 验证模型 ''' logger.info('Evaluating hnsw model') nq, d = vecs.shape t0 = time.time # 找top1个相似的 D, I = self.index.search(vecs, 1) t1 = time.time missing_rate = (I == -1).sum() / float(nq) recall_at_1 = (I == np.arange(nq)).sum() / float(nq) print("\t %7.3f ms per query, R@1 %.4f, missing rate %.4f" % ( (t1 - t0) * 1000.0 / nq, recall_at_1, missing_rate)) def build_hnsw(self, to_file, ef=2000, m=64): """ 训练hnsw模型 """ logger.info('构建 HNSW 索引') # 所有的句向量拼接 vecs = np.stack(self.data['subject_name_vec'].values).reshape(-1, word2vec_dim) vecs = vecs.astype('float32') dim = self.w2v_model.vector_size # 构建索引 index = faiss.IndexHNSWFlat(dim, m) # 使用单个GPU资源 res = faiss.StandardGpuResources() faiss.index_cpu_to_gpu(res, 0, index) index.hnsw.ef_construction = ef index.verbose = True index.add(vecs) # 保存hnsw模型 faiss.write_index(index, to_file) return index def load_hnsw(self, model_path): hnsw = faiss.read_index(model_path) return hnsw def search(self, text, k=100): """ 通过hnsw检索topk """ logger.info(f"Searching top {k} similarity for {text}.") # 转换句向量 test_vec = sentence_embedding(text, self.w2v_model, self.fassi_w2v_model, verbose=True) test_vec = test_vec.astype('float32') # 搜索相似度最高的k个句向量 # 多召回一些避免通过status过滤后不足k个 D, I = self.index.search(test_vec, k * 2 ) top_k_index = list(I.ravel()) top_k_Item = [] for index in top_k_index: subject_id = int(self.data.iloc[index]['subject_id']) subject_name = self.data.iloc[index]['subject_name'] top_k_Item.append((subject_id, subject_name)) # 只返回topk个 return top_k_index[:k], top_k_Item[:k] def hnsw_search_test(hnsw): while True: topk = 100 item_name = input('请输入您的课程名称:').strip() top_k_index, top_k_Item = hnsw.search(item_name, k=topk) df_top_k = pd.DataFrame(columns=['index', 'subject_recommend']) pd.set_option('colheader_justify', 'center') for i in range(topk): df_top_k.loc[i] = [top_k_index[i], top_k_Item[i]] print(df_top_k) if __name__ == '__main__': hnsw = HNSW(config.word2vec_model_path, config.subject_faiss_w2v_path, config.ef_construction, config.M, config.subjects_fassi_model_path, config.subjects_data_path)