import os import sys sys.path.append(os.getcwd()) import numpy as np from tqdm import tqdm import faiss import warnings import pickle import collections import random from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import LabelEncoder import tensorflow as tf from tensorflow.python.keras import backend as K from tensorflow.python.keras.models import Model from tensorflow.keras.preprocessing.sequence import pad_sequences from deepmatch.models import MIND from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat from collections import Counter from config import logger, offline_mode from config import need_metric_recall from config import embedding_dim, ef_construction, M from config import subject_mind_recall0_dict,subject_mind_recall1_dict from config import subject_mind_user_emb0_dict,subject_mind_user_emb1_dict from config import subject_mind_item_emb_dict from config import subject_mind_item_faiss_model_path from config import subject_mind_user0_faiss_model_path,subject_mind_user1_faiss_model_path from config import subject_mind_user_embedding0_index_dict,subject_mind_user_embedding1_index_dict from config import subject_mind_item_embedding_index_dict from config import subject_mind_train_input_data from config import subject_mind_train_label_data from config import subject_mind_test_input_data from config import subject_mind_test_label_data from config import subject_mind_train_set_data from config import subject_mind_test_set_data from config import subject_mind_user_embedding0_data,subject_mind_user_embedding1_data from config import subject_mind_item_embedding_data from config import samples_mode from matching.subject.recall_comm import get_all_select_df from matching.subject.recall_comm import get_user_info_df,get_item_info_df from matching.subject.recall_comm import metrics_recall,get_all_hist_and_last_select K.set_learning_phase(True) if tf.__version__ >= '2.0.0': tf.compat.v1.disable_eager_execution() tqdm.pandas() warnings.filterwarnings('ignore') def gen_data_set(data, negsample=0): """ 获取MIND召回时的训练验证数据 negsample指的是通过滑窗构建样本的时候,负采集样样本的数量 """ data.sort_values("created_timestamp", inplace=True) item_ids = data['subject_id'].unique() train_set = [] test_set = [] if os.path.exists(subject_mind_train_set_data) and \ os.path.exists(subject_mind_test_set_data): train_set = pickle.load(open(subject_mind_train_set_data, 'rb')) test_set = pickle.load(open(subject_mind_test_set_data, 'rb')) return train_set, test_set for reviewerID, hist_select in tqdm(data.groupby('user_id')): pos_list = hist_select['subject_id'].tolist() if negsample > 0: # 用户没选择的物品里面选择负样本 candidate_set = list(set(item_ids) - set(pos_list)) # 对于每个正样本,选择n个负样本 neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) # 长度只有一个的时候,需要把这条数据也放到训练集中,不然最终学到的embedding就会有缺失 if len(pos_list) == 1: train_set.append((reviewerID, [pos_list[0]], pos_list[0], 1, len(pos_list), hist_select.iloc[0]['gender'], hist_select.iloc[0]['identity'], hist_select.iloc[0]['edu_background'],hist_select.iloc[0]['logins'], hist_select.iloc[0]['grade'],hist_select.iloc[0]['experience'])) test_set.append((reviewerID, [pos_list[0]], pos_list[0], 1, len(pos_list), hist_select.iloc[0]['gender'], hist_select.iloc[0]['identity'], hist_select.iloc[0]['edu_background'],hist_select.iloc[0]['logins'], hist_select.iloc[0]['grade'],hist_select.iloc[0]['experience'])) # 滑窗构造正负样本 for i in range(1, len(pos_list)): hist_sel = pos_list[:i] if i != len(pos_list) - 1: # 正样本 [user_id, pos_item, label, len(his_item)...] train_set.append((reviewerID, hist_sel[::-1], pos_list[i], 1, len(hist_sel[::-1]), hist_select.iloc[0]['gender'], hist_select.iloc[0]['identity'], hist_select.iloc[0]['edu_background'],hist_select.iloc[0]['logins'], hist_select.iloc[0]['grade'],hist_select.iloc[0]['experience'])) # logger.info("滑窗构造负样本") for negi in range(negsample): # 负样本 [user_id, his_item, neg_item, label, len(his_item)...] train_set.append((reviewerID, hist_sel[::-1], neg_list[i*negsample+negi], 0, len(hist_sel[::-1]), hist_select.iloc[0]['gender'], hist_select.iloc[0]['identity'], hist_select.iloc[0]['edu_background'],hist_select.iloc[0]['logins'], hist_select.iloc[0]['grade'],hist_select.iloc[0]['experience'])) else: # 将最长的那一个序列长度作为测试数据 test_set.append((reviewerID, hist_sel[::-1], pos_list[i], 1, len(hist_sel[::-1]), hist_select.iloc[0]['gender'], hist_select.iloc[0]['identity'], hist_select.iloc[0]['edu_background'],hist_select.iloc[0]['logins'], hist_select.iloc[0]['grade'],hist_select.iloc[0]['experience'])) random.shuffle(train_set) random.shuffle(test_set) pickle.dump(train_set, open(subject_mind_train_set_data, 'wb')) pickle.dump(test_set, open(subject_mind_test_set_data, 'wb')) return train_set, test_set def gen_model_input(train_set, user_profile, seq_max_len): """ 将输入的数据进行padding,使得序列特征的长度都一致 """ train_uid = np.array([line[0] for line in train_set]) train_seq = [line[1] for line in train_set] train_iid = np.array([line[2] for line in train_set]) train_label = np.array([line[3] for line in train_set]) train_hist_len = np.array([line[4] for line in train_set]) train_gender = np.array([line[5] for line in train_set]) train_identity = np.array([line[6] for line in train_set]) train_edu_background = np.array([line[7] for line in train_set]) train_logins = np.array([line[8] for line in train_set]) train_grade = np.array([line[9] for line in train_set]) train_experience = np.array([line[10] for line in train_set]) train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0) train_model_input = {"user_id": train_uid, "subject_id": train_iid, "hist_subject_id": train_seq_pad, "hist_len": train_hist_len, "gender": train_gender, "identity": train_identity, "edu_background": train_edu_background, "logins":train_logins, "grade": train_grade, "experience": train_experience} return train_model_input, train_label def mind_recall(data, topk=20): # 定义特征 user_sparse_features = ['user_id'] user_dense_features = ['gender', 'identity', 'edu_background','logins','grade','experience'] item_sparse_features = ['subject_id'] # 用户选择序列的长度,短的填充,长的截断 if samples_mode == True: SEQ_LEN = 100 else: SEQ_LEN = 200 user_profile_dup = data[["user_id"]].drop_duplicates('user_id') item_profile_dup = data[["subject_id"]].drop_duplicates('subject_id') # 定义特征 sparse_features = user_sparse_features + item_sparse_features feature_max_idx = {} for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) feature_max_idx[feature] = data[feature].max() + 1 for feature in user_dense_features: min_max_scaler = MinMaxScaler() data[feature] = min_max_scaler.fit_transform(data[[feature]]) # 提取user和item的特征 user_profile = data[user_sparse_features + user_dense_features].drop_duplicates('user_id') item_profile = data[item_sparse_features].drop_duplicates('subject_id') user_index_2_rawid = dict(zip(user_profile['user_id'], user_profile_dup['user_id'])) item_index_2_rawid = dict(zip(item_profile['subject_id'], item_profile_dup['subject_id'])) # 生成训练集和测试集 # 由于深度学习需要的数据量非常大 # 为了保证召回的效果,通过滑窗的形式扩充训练样本 train_set, test_set = gen_data_set(data, 1) if os.path.exists(subject_mind_train_input_data) and \ os.path.exists(subject_mind_train_label_data): # 加载MIND模型的训练集数据 train_model_input = pickle.load(open(subject_mind_train_input_data, 'rb')) train_label = pickle.load(open(subject_mind_train_label_data, 'rb')) else: # 生成并保存MIND模型的训练集数据 train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) pickle.dump(train_model_input, open(subject_mind_train_input_data, 'wb')) pickle.dump(train_label, open(subject_mind_train_label_data, 'wb')) if os.path.exists(subject_mind_test_input_data) and \ os.path.exists(subject_mind_test_label_data): # 加载MIND模型的测试集数据 test_model_input = pickle.load(open(subject_mind_test_input_data, 'rb')) test_label = pickle.load(open(subject_mind_test_label_data, 'rb')) else: # 生成并保存MIND模型的测试集数据 test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) pickle.dump(test_model_input, open(subject_mind_test_input_data, 'wb')) pickle.dump(test_label, open(subject_mind_test_label_data, 'wb')) # 将数据整理成模型可以直接输入的形式 # 用户特征由稀疏特征user_id和可变长度特征hist_subject_id组成 user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), DenseFeat('gender', dimension=1), DenseFeat('identity', dimension=1), DenseFeat('edu_background', dimension=1), DenseFeat('logins', dimension=1), DenseFeat('grade', dimension=1), DenseFeat('experience', dimension=1), VarLenSparseFeat(SparseFeat('hist_subject_id', feature_max_idx['subject_id'], embedding_dim, embedding_name="subject_id"), SEQ_LEN, 'mean', 'hist_len'),] # 物品特征由稀疏特征subject_id组成 item_feature_columns = [SparseFeat('subject_id', feature_max_idx['subject_id'], embedding_dim)] # 进行负采样 train_counter = Counter(train_model_input['subject_id']) item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='subject_id', item_count=item_count) # 模型的定义 # num_sampled: 负采样时的样本数量 model = MIND(user_feature_columns,item_feature_columns, k_max=2, p=100,dynamic_k=False, sampler_config=sampler_config, user_dnn_hidden_units=(128, embedding_dim)) # 模型编译 model.compile(optimizer="adam", loss=sampledsoftmaxloss) # 模型训练,这里可以定义验证集的比例,如果设置为0的话就是全量数据直接进行训练 history = model.fit(train_model_input, train_label, batch_size=256, epochs=5, verbose=1, validation_split=0.0) # 训练完模型之后,提取训练的Embedding,包括user端和item端 test_user_model_input = test_model_input # print(test_user_model_input.shape) all_item_model_input = {"subject_id": item_profile['subject_id'].values} user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) # 保存输出的item_embedding和user_embedding # 排序的时候用到,保存的时候需要和原始的id对应 user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) # embedding保存之前归一化一下 user_embs = user_embs / np.linalg.norm(user_embs, axis=1, keepdims=True) item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True) # MIND模型学到用户的两个兴趣胶囊向量 user_embs0 = user_embs[:, 0, :] user_embs1 = user_embs[:, 1, :] pickle.dump(user_embs0, open(subject_mind_user_embedding0_data, 'wb')) pickle.dump(user_embs1, open(subject_mind_user_embedding1_data, 'wb')) pickle.dump(item_embs, open(subject_mind_item_embedding_data, 'wb')) logger.info('构建mind item embedding HNSW索引') build_hnsw(item_embs, subject_mind_item_faiss_model_path, ef_construction, M, embedding_dim) logger.info('构建mind user embedding0 HNSW索引') build_hnsw(user_embs0, subject_mind_user0_faiss_model_path, ef_construction, M, embedding_dim) logger.info('构建mind user embedding1 HNSW索引') build_hnsw(user_embs1, subject_mind_user1_faiss_model_path, ef_construction, M, embedding_dim) # 用户第一个兴趣向量user_embs0 logger.info('用户第一个兴趣向量user_embs0') logger.info('将embedding转换成字典的形式方便查询') raw_user_id_emb0_dict = {user_index_2_rawid[k]: v for k, v in zip(user_profile['user_id'], user_embs0)} raw_item_id_emb_dict = {item_index_2_rawid[k]: v for k, v in zip(item_profile['subject_id'], item_embs)} pickle.dump(raw_user_id_emb0_dict, open(subject_mind_user_emb0_dict, 'wb')) pickle.dump(raw_item_id_emb_dict, open(subject_mind_item_emb_dict, 'wb')) # 生成user embedding和item embedding索引字典 logger.info("生成user embedding0和item embedding索引字典") mind_user_embedding0_index_dict = {k: user_index_2_rawid[v] for k, v in enumerate(user_profile['user_id'])} mind_item_embedding_index_dict = {k: item_index_2_rawid[v] for k, v in enumerate(item_profile['subject_id'])} pickle.dump(mind_user_embedding0_index_dict, open(subject_mind_user_embedding0_index_dict, 'wb')) pickle.dump(mind_item_embedding_index_dict, open(subject_mind_item_embedding_index_dict, 'wb')) # 定义用户召回字典 logger.info("定义用户兴趣召回字典") user0_recall_items_dict = collections.defaultdict(dict) if not os.path.exists(subject_mind_recall0_dict): # 加载保存的faiss索引 index = faiss.read_index(subject_mind_item_faiss_model_path) # 通过user_embedding去查询最相似的topk+1个item_embedding sim, idx = index.search(np.ascontiguousarray(user_embs0), topk+1) logger.info('生成mind所有用户第一个兴趣的召回列表') for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)): target_raw_id = user_index_2_rawid[target_idx] # 从1开始是为了去掉物品本身 for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): rele_raw_id = item_index_2_rawid[rele_idx] user0_recall_items_dict[target_raw_id][rele_raw_id] = \ user0_recall_items_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value # 将召回的结果进行排序,直接得到召回的结果 user0_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) \ for k, v in user0_recall_items_dict.items()} pickle.dump(user0_recall_items_dict, open(subject_mind_recall0_dict, 'wb')) # 用户第二个兴趣向量user_embs1 logger.info('用户第二个兴趣向量user_embs1') logger.info('将embedding转换成字典的形式方便查询') raw_user_id_emb1_dict = {user_index_2_rawid[k]: v for k, v in zip(user_profile['user_id'], user_embs1)} pickle.dump(raw_user_id_emb1_dict, open(subject_mind_user_emb1_dict, 'wb')) # 生成user embedding logger.info("生成user embedding0索引字典") mind_user_embedding1_index_dict = {k: user_index_2_rawid[v] for k, v in enumerate(user_profile['user_id'])} pickle.dump(mind_user_embedding1_index_dict, open(subject_mind_user_embedding1_index_dict, 'wb')) # 定义用户召回字典 logger.info("定义用户召回字典") user1_recall_items_dict = collections.defaultdict(dict) if not os.path.exists(subject_mind_recall1_dict): # 加载保存的faiss索引 index = faiss.read_index(subject_mind_item_faiss_model_path) # 通过user_embedding去查询最相似的topk+1个item_embedding sim, idx = index.search(np.ascontiguousarray(user_embs1), topk+1) logger.info('生成mind所有用户第二个兴趣的召回列表') for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)): target_raw_id = user_index_2_rawid[target_idx] # 从1开始是为了去掉物品本身 for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): rele_raw_id = item_index_2_rawid[rele_idx] user1_recall_items_dict[target_raw_id][rele_raw_id] = \ user1_recall_items_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value # 将召回的结果进行排序,直接得到召回的结果 user1_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) \ for k, v in user1_recall_items_dict.items()} pickle.dump(user1_recall_items_dict, open(subject_mind_recall1_dict, 'wb')) return user0_recall_items_dict,user1_recall_items_dict def build_hnsw(vecs, to_file, ef=2000, m=64, dim=100): """ 训练hnsw模型 """ vecs = vecs.astype('float32') # 构建索引 index = faiss.IndexFlatIP(dim) index.add(vecs) # 保存hnsw模型 faiss.write_index(index, to_file) return index def mind_recall_train(): """ mind召回训练和评估 """ # 需要召回的数量 recall_item_num = 100 logger.info("加载物品行为数据") all_select_df = get_all_select_df(offline=offline_mode) logger.info("加载物品属性数据") # item_merage_em = get_item_eminfo_df() item_info = get_item_info_df() logger.info("获取用户信息数据") users_info = get_user_info_df() all_select_df = all_select_df.merge(users_info, on='user_id') all_select_df = all_select_df.merge(item_info, on='subject_id') # 为了召回评估,提取最后一次选择作为召回评估 # 如果不需要做召回评估直接使用全量的训练集进行召回 if need_metric_recall: logger.info('获取物品行为数据历史和最后一次选择') train_hist_select_df, train_last_select_df = get_all_hist_and_last_select(all_select_df) else: train_hist_select_df = all_select_df # train_hist_select_df= train_hist_select_df.sample(frac=0.001) # MIND模型训练 user0_recall_items_dict ,user1_recall_items_dict = mind_recall(train_hist_select_df, topk=recall_item_num) # 召回效果评估 if samples_mode == True and need_metric_recall: logger.info('MIND用户第一个兴趣召回效果评估') metrics_recall(user0_recall_items_dict, train_last_select_df, topk=recall_item_num) logger.info('MIND用户第二个兴趣召回效果评估') metrics_recall(user1_recall_items_dict, train_last_select_df, topk=recall_item_num) if __name__ == '__main__': mind_recall_train()