You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

267 lines
12 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import sys
sys.path.append(os.getcwd())
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import warnings
from ranking.subject.rank_comm import save_rank_results
from deepctr.models import BST
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.models import save_model
import tensorflow as tf
from config import offline_mode, subject_features_save_path
from config import subject_train_user_item_feats
from config import subject_val_user_item_feats
from config import subject_test_user_item_feats
from config import subject_model_save_path
from config import subject_all_user_item_feats
from config import logger
from config import subject_rank_dense_fea
from config import subject_rank_sparse_fea
from config import subject_max_seq_len
from config import subject_rank_feats_columns
from matching.subject.recall_comm import get_all_select_df
from matching.subject.recall_comm import get_item_info_df
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
K.set_learning_phase(True)
if tf.__version__ >= '2.0.0':
tf.compat.v1.disable_eager_execution()
tqdm.pandas()
warnings.filterwarnings('ignore')
# 数据准备函数
def get_bst_feats_columns(df,
dense_fea,
sparse_fea,
behavior_fea,
his_behavior_fea,
emb_dim=32,
max_len=100):
"""
数据准备函数:
df: 数据集
dense_fea: 数值型特征列
sparse_fea: 离散型特征列
behavior_fea: 用户的候选行为特征列
his_behavior_fea: 用户的历史行为特征列
embedding_dim: embedding的维度这里为了简单统一把离散型特征列采用一样的隐向量维度
max_len: 用户序列的最大长度
"""
sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1,
embedding_dim = emb_dim) for feat in sparse_fea]
dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]
var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['subject_id'].nunique() + 1,
embedding_dim=emb_dim, embedding_name='subject_id'), maxlen=max_len)
for feat in his_behavior_fea]
dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
# 建立x, x是一个字典的形式
x = {}
for name in get_feature_names(dnn_feature_columns):
if name in his_behavior_fea:
# 这是历史行为序列
his_list = [l for l in df[name]]
# 二维数组
x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')
else:
x[name] = df[name].values
return x, dnn_feature_columns
if __name__ == '__main__':
logger.info('加载物品行为数据')
all_data = get_all_select_df()
logger.info('生成用户历史选择物品数据')
hist_select = all_data[['user_id', 'subject_id']].groupby('user_id').agg({list}).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_select['user_id']
his_behavior_df['hist_subject_id'] = hist_select['subject_id']
logger.info('获取物品信息')
subject_info_df = get_item_info_df()
# 物品总数量
subjects_count = len(subject_info_df['subject_id'].unique().tolist())
logger.info('加载用户行为特征')
# 所有用户物品特征
all_user_item_feats_df = pd.read_csv(subject_all_user_item_feats, sep='\t', encoding='utf-8')
# subject_id转换成int
train_user_item_feats_df = pd.read_csv(subject_train_user_item_feats, sep='\t', encoding='utf-8')
train_user_item_feats_df['subject_id'] = train_user_item_feats_df['subject_id'].astype(int)
if offline_mode:
val_user_item_feats_df = pd.read_csv(subject_val_user_item_feats, sep='\t', encoding='utf-8')
val_user_item_feats_df['subject_id'] = val_user_item_feats_df['subject_id'].astype(int)
else:
val_user_item_feats_df = None
test_user_item_feats_df = pd.read_csv(subject_test_user_item_feats, sep='\t', encoding='utf-8')
test_user_item_feats_df['subject_id'] = test_user_item_feats_df['subject_id'].astype(int)
# 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉
del test_user_item_feats_df['label']
train_user_item_feats_df = train_user_item_feats_df.merge(his_behavior_df, on='user_id')
if offline_mode:
val_user_item_feats_df = val_user_item_feats_df.merge(his_behavior_df, on='user_id')
else:
val_user_item_feats_df = None
test_user_item_feats_df = test_user_item_feats_df.merge(his_behavior_df, on='user_id')
# 把特征分开
sparse_fea = subject_rank_sparse_fea
behavior_fea = ['subject_id']
hist_behavior_fea = ['hist_subject_id']
dense_fea = subject_rank_dense_fea
train_user_item_feats_df[dense_fea] = train_user_item_feats_df[dense_fea].fillna(0, )
if val_user_item_feats_df is not None:
val_user_item_feats_df[dense_fea] = val_user_item_feats_df[dense_fea].fillna(0, )
test_user_item_feats_df[dense_fea] = test_user_item_feats_df[dense_fea].fillna(0, )
# 处理inf的值
train_user_item_feats_df.replace([np.inf, -np.inf], 0, inplace=True)
test_user_item_feats_df.replace([np.inf, -np.inf], 0, inplace=True)
# dense特征进行归一化
for feat in dense_fea:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit((all_user_item_feats_df[[feat]]))
pickle.dump(min_max_scaler, open(subject_model_save_path + 'min_max_scaler_' + feat + '.model', 'wb'))
train_user_item_feats_df[feat] = min_max_scaler.transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = min_max_scaler.transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = min_max_scaler.transform(test_user_item_feats_df[[feat]])
# sparse特征进行LabelEncoder
for feat in sparse_fea:
label_encoder = LabelEncoder()
if feat == 'subject_id':
label_encoder.fit(subject_info_df[[feat]])
subject_id_lable_encoder = label_encoder
else:
label_encoder.fit(all_user_item_feats_df[[feat]])
if feat == 'user_id':
user_id_label_encoder = label_encoder
pickle.dump(label_encoder, open(subject_model_save_path + feat + '_label_encoder.model', 'wb'))
train_user_item_feats_df[feat] = label_encoder.transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = label_encoder.transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = label_encoder.transform(test_user_item_feats_df[[feat]])
# 准备训练数据
x_train, dnn_feature_columns = get_bst_feats_columns(train_user_item_feats_df,
dense_fea,
sparse_fea,
behavior_fea,
hist_behavior_fea,
max_len=subject_max_seq_len)
y_train = train_user_item_feats_df['label'].values
if val_user_item_feats_df is not None:
# 准备验证数据
x_val, dnn_feature_columns = get_bst_feats_columns(val_user_item_feats_df,
dense_fea,
sparse_fea,
behavior_fea,
hist_behavior_fea,
max_len=subject_max_seq_len)
y_val = val_user_item_feats_df['label'].values
dense_fea = [x for x in dense_fea if x != 'label']
x_test, dnn_feature_columns = get_bst_feats_columns(test_user_item_feats_df,
dense_fea,
sparse_fea,
behavior_fea,
hist_behavior_fea,
max_len=subject_max_seq_len)
# 建立模型
model = BST(dnn_feature_columns, behavior_fea)
# 查看模型结构
model.summary()
# 模型编译
model.compile(optimizer="Adam",
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['AUC']#评价指标AUC,精确度,均方误差
)
# 模型训练
if val_user_item_feats_df is not None:
history = model.fit(x_train,
y_train,
verbose=1,
epochs=5,
validation_data=(x_val, y_val),
batch_size=256)
else:
# 也可以使用下面的语句用自己采样出来的验证集
history = model.fit(x_train,
y_train,
verbose=1,
epochs=5,
validation_split=0.3,
batch_size=256)
# 保存训练好的BST模型
# save_model(model, subject_model_save_path + 'bst_model.h5')
model.save(subject_model_save_path + 'bst_model.h5')
# 模型预测
test_user_item_feats_df['pred_score'] = model.predict(x_test, verbose=1, batch_size=256)
# 还原user_id和subject_id
test_user_item_feats_df['user_id'] = user_id_label_encoder.inverse_transform(test_user_item_feats_df[['user_id']])
test_user_item_feats_df['subject_id'] = subject_id_lable_encoder.inverse_transform(test_user_item_feats_df[['subject_id']])
item_info_df = get_item_info_df()
item_info_df = item_info_df[['subject_id', 'subject_name']]
test_user_item_feats_df = test_user_item_feats_df.merge(item_info_df, how='left', on='subject_id')
test_user_item_feats_df.sort_values(by=['user_id', 'pred_score'], ascending=[True, False], inplace=True)
test_user_item_feats_df['pred_rank'] = test_user_item_feats_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first').astype(int)
for feat in subject_rank_feats_columns:
min_max_scaler = pickle.load(open(subject_model_save_path + 'min_max_scaler_' + feat + '.model', 'rb'))
train_user_item_feats_df[feat] = min_max_scaler.inverse_transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = min_max_scaler.inverse_transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = min_max_scaler.inverse_transform(test_user_item_feats_df[[feat]])
test_user_item_feats_df[['user_id', 'subject_id', 'subject_name'] + subject_rank_feats_columns + ['pred_score', 'pred_rank']].to_csv(
subject_features_save_path + 'bst_rank_score.csv', sep='\t', index=False, header=True)