You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

152 lines
6.0 KiB

5 months ago
import os
import sys
sys.path.append(os.getcwd())
from datetime import datetime
import warnings
import pandas as pd
from tqdm import tqdm
import pickle
from matching.subject.recall_comm import get_item_info_df
from deepctr.layers import custom_objects
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
from tensorflow.python.keras.models import load_model
import tensorflow as tf
from config import subject_model_save_path
from config import logger
from config import subject_difm_rank_dict
from config import subject_rank_dense_fea
from config import subject_rank_sparse_fea
from config import mysubjects_data_path
from config import subject_features_save_path
from utils import get_user
from ranking.subject.difm_ranker_train import get_difm_feats_columns
from ranking.subject.user_recall_rank_features import init_rank_features
from ranking.subject.user_recall_rank_features import build_rank_features_offline
# Module-level initialisation: configure the GPU / TensorFlow session and load
# the shared artefacts (item metadata, trained DIFM model) used by the
# prediction functions in this file.

# NOTE: a `global` statement at module scope has no effect; it is kept only as
# a marker that `graph` and `sess` below are shared module-level state.
global graph, sess

# Enumerate GPUs in PCI bus order and make only device "1" visible.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# NOTE(review): learning phase True selects training-mode behaviour in Keras;
# this file only runs inference, so confirm this is intended before changing it.
K.set_learning_phase(True)

# Compare the major version numerically. A plain string comparison against
# '2.0.0' is lexicographic and would misclassify e.g. '10.0.0' as older.
if int(tf.__version__.split('.')[0]) >= 2:
    tf.compat.v1.disable_eager_execution()

# Capture the graph and session that the model is loaded into, so that
# predictions can later run against the same graph/session.
graph = tf.compat.v1.get_default_graph()
sess = tf.compat.v1.keras.backend.get_session()

warnings.filterwarnings('ignore')

logger.info('获取物品信息')
item_info_df = get_item_info_df()
# Keep only the id/name columns; reset_index() is called without drop=True,
# so the previous index is retained as a column.
item_info_df = item_info_df[['subject_id', 'subject_name']].reset_index()

logger.info('加载DIFM排序模型')
# The second positional argument supplies deepctr's custom layer objects.
difm_model = load_model(subject_model_save_path + 'difm_model.h5', custom_objects)
def difm_ranker_predict(user_item_feats_df, topk=10, verbose=True):
    """
    Score user/item feature rows with the loaded DIFM model and rank them.

    NOTE: ``user_item_feats_df`` is modified in place: dense features are
    filled and scaled, sparse features are label-encoded, a ``pred_score``
    column is added, and ``user_id`` / ``subject_id`` are decoded back.

    :param user_item_feats_df: ranking-model features generated for a user_id;
        must contain the columns named in ``subject_rank_dense_fea`` and
        ``subject_rank_sparse_fea``
    :param topk: number of top-ranked rows to return
    :param verbose: when true, log the elapsed prediction time
    :return: DataFrame with columns ``user_id``, ``subject_id``,
        ``subject_name``, ``pred_score`` and ``pred_rank``, sorted by
        ``pred_score`` descending and truncated to ``topk`` rows
    :raises KeyError: if ``user_id`` or ``subject_id`` is not among the sparse
        features (their encoders are needed to decode the ids)
    """
    start_time = datetime.now()

    # Sparse (categorical) features
    sparse_fea = subject_rank_sparse_fea
    # Dense (continuous) features
    dense_fea = subject_rank_dense_fea

    # Fill missing dense values
    user_item_feats_df[dense_fea] = user_item_feats_df[dense_fea].fillna(0)

    # Scale each dense feature with its persisted scaler.
    # NOTE: pickle files are loaded from subject_model_save_path; only load
    # artefacts from a trusted location.
    for feat in dense_fea:
        scaler_path = subject_model_save_path + 'min_max_scaler_' + feat + '.model'
        with open(scaler_path, 'rb') as scaler_file:
            min_max_scaler = pickle.load(scaler_file)
        user_item_feats_df[feat] = min_max_scaler.transform(user_item_feats_df[[feat]])

    # Encode each sparse feature with its persisted label encoder, keeping the
    # encoders so the id columns can be decoded after prediction.
    label_encoders = {}
    for feat in sparse_fea:
        encoder_path = subject_model_save_path + feat + '_label_encoder.model'
        with open(encoder_path, 'rb') as encoder_file:
            label_encoder = pickle.load(encoder_file)
        user_item_feats_df[feat] = label_encoder.transform(user_item_feats_df[[feat]])
        label_encoders[feat] = label_encoder

    user_id_label_encoder = label_encoders['user_id']
    subject_id_label_encoder = label_encoders['subject_id']

    x, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns(
        user_item_feats_df, dense_fea, sparse_fea)

    # Model prediction, run inside the graph/session captured at import time
    with graph.as_default():
        with sess.as_default():
            user_item_feats_df['pred_score'] = difm_model.predict(x, verbose=1, batch_size=256)

    # Restore the original user_id and subject_id values
    user_item_feats_df['user_id'] = user_id_label_encoder.inverse_transform(
        user_item_feats_df[['user_id']])
    user_item_feats_df['subject_id'] = subject_id_label_encoder.inverse_transform(
        user_item_feats_df[['subject_id']])

    # Work on an explicit copy so the assignments below do not write through a
    # selection of the input frame (avoids chained-assignment ambiguity).
    rank_results = user_item_feats_df[['user_id', 'subject_id', 'pred_score']].copy()
    rank_results['user_id'] = rank_results['user_id'].astype(int)
    rank_results['subject_id'] = rank_results['subject_id'].astype(int)

    # Attach the subject name and sort by predicted score, highest first
    rank_results = rank_results.merge(item_info_df, how='left', on='subject_id')
    rank_results = rank_results[['user_id', 'subject_id', 'subject_name', 'pred_score']]
    rank_results = rank_results.sort_values(by=['pred_score'], ascending=False)
    rank_results['pred_rank'] = rank_results['pred_score'].rank(
        ascending=False, method='first').astype(int)
    rank_results = rank_results[:topk]

    # Elapsed time in milliseconds. Both timestamps come from the same clock,
    # and total_seconds() covers the whole duration (timedelta.microseconds is
    # only the sub-second component).
    elapsed = datetime.now() - start_time
    cost_time_millisecond = round(elapsed.total_seconds() * 1000.0, 3)
    if verbose:
        logger.info(f"DIFM 预测耗时: {cost_time_millisecond} 毫秒")

    return rank_results
def alluser_difm_ranker_predict():
    """
    Build and persist the DIFM-ranked recall list for every user.

    For each user id returned by ``get_user(mysubjects_data_path)``, the
    offline ranking features are built, scored with ``difm_ranker_predict``
    and stored as a list of ``(subject_id, subject_name)`` tuples in ranked
    order. Users with no feature rows get an empty list. The resulting dict
    (user_id -> list) is pickled to ``subject_difm_rank_dict``.
    """
    init_rank_features()

    recall_rank_list_lict = {}
    all_user_ids = get_user(mysubjects_data_path)

    for user_id in tqdm(all_user_ids):
        user_id = int(user_id)
        # Register the user up front so users without features still appear
        ranked_items = recall_rank_list_lict.setdefault(user_id, [])

        user_item_feats_df = build_rank_features_offline(user_id)
        if user_item_feats_df.shape[0] == 0:
            continue

        # topk = number of rows, i.e. keep every candidate, just reordered
        rank_results = difm_ranker_predict(user_item_feats_df,
                                           topk=user_item_feats_df.shape[0],
                                           verbose=False)
        ranked_items.extend(
            zip(rank_results['subject_id'], rank_results['subject_name']))

    # Use a context manager so the file is flushed and closed deterministically
    with open(subject_difm_rank_dict, 'wb') as rank_dict_file:
        pickle.dump(recall_rank_list_lict, rank_dict_file)
if __name__ == '__main__':
    # Script entry point: show a sample of the features before and after DIFM
    # ranking, then generate the ranked recall dict for all users.
    feats_csv_path = subject_features_save_path + 'user_item_feats_df.csv'
    feats_df = pd.read_csv(feats_csv_path, sep='\t')

    # Join subject names onto a separate frame for display; this is done
    # before prediction because the prediction call receives feats_df itself.
    before_cols = ['user_id', 'subject_id', 'subject_name', 'score', 'rank']
    before_df = feats_df.merge(item_info_df, how='left', on='subject_id')
    logger.info('DIFM排序之前的数据:')
    print(before_df[before_cols].head(20))

    # Rank every row (topk equals the row count)
    after_cols = ['user_id', 'subject_id', 'subject_name', 'pred_score', 'pred_rank']
    ranked_df = difm_ranker_predict(feats_df, topk=feats_df.shape[0])
    logger.info('DIFM排序之后的数据:')
    print(ranked_df[after_cols].head(20))

    alluser_difm_ranker_predict()