You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

202 lines
8.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import sys
sys.path.append(os.getcwd())
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings
from ranking.subject.rank_comm import save_rank_results
from deepctr.models import DIFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.models import save_model
import tensorflow as tf
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
from config import offline_mode, subject_features_save_path
from config import subject_train_user_item_feats
from config import subject_val_user_item_feats
from config import subject_test_user_item_feats
from config import subject_model_save_path
from config import subject_all_user_item_feats
from config import subject_rank_dense_fea
from config import subject_rank_sparse_fea
from config import subject_rank_feats_columns
from config import logger
from matching.subject.recall_comm import get_item_info_df
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
K.set_learning_phase(True)
if tf.__version__ >= '2.0.0':
tf.compat.v1.disable_eager_execution()
tqdm.pandas()
warnings.filterwarnings('ignore')
# 加这几行避免训练报错
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
def get_difm_feats_columns(df, dense_fea, sparse_fea, emb_dim=32):
"""
数据准备函数:
df: 数据集
dense_fea: 数值型特征列
sparse_fea: 离散型特征列
"""
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].max() + 1, embedding_dim=emb_dim)
for i, feat in enumerate(sparse_fea)] + [DenseFeat(feat, 1, )
for feat in dense_fea]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
x = {}
x = {name: df[name].values for name in feature_names}
return x, linear_feature_columns, dnn_feature_columns
if __name__ == '__main__':
logger.info('获取物品信息')
subject_info_df = get_item_info_df()
# 课程总数量
subjects_count = len(subject_info_df['subject_id'].unique().tolist())
logger.info('加载用户行为特征')
# 所有用户物品特征
all_user_item_feats_df = pd.read_csv(subject_all_user_item_feats, sep='\t', encoding='utf-8')
train_user_item_feats_df = pd.read_csv(subject_train_user_item_feats, sep='\t', encoding='utf-8')
train_user_item_feats_df['subject_id'] = train_user_item_feats_df['subject_id'].astype(int)
if offline_mode:
val_user_item_feats_df = pd.read_csv(subject_val_user_item_feats, sep='\t', encoding='utf-8')
val_user_item_feats_df['subject_id'] = val_user_item_feats_df['subject_id'].astype(int)
else:
val_user_item_feats_df = None
test_user_item_feats_df = pd.read_csv(subject_test_user_item_feats, sep='\t', encoding='utf-8')
test_user_item_feats_df['subject_id'] = test_user_item_feats_df['subject_id'].astype(int)
# 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉就行
del test_user_item_feats_df['label']
# 把特征分开
sparse_fea = subject_rank_sparse_fea
dense_fea = subject_rank_dense_fea
train_user_item_feats_df[dense_fea] = train_user_item_feats_df[dense_fea].fillna(0, )
if val_user_item_feats_df is not None:
val_user_item_feats_df[dense_fea] = val_user_item_feats_df[dense_fea].fillna(0, )
test_user_item_feats_df[dense_fea] = test_user_item_feats_df[dense_fea].fillna(0, )
# dense特征进行归一化
for feat in dense_fea:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit((all_user_item_feats_df[[feat]]))
pickle.dump(min_max_scaler, open(subject_model_save_path + 'min_max_scaler_' + feat + '.model', 'wb'))
train_user_item_feats_df[feat] = min_max_scaler.transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = min_max_scaler.transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = min_max_scaler.transform(test_user_item_feats_df[[feat]])
# sparse特征进行LabelEncoder
for feat in sparse_fea:
label_encoder = LabelEncoder()
if feat == 'subject_id':
label_encoder.fit(subject_info_df[[feat]])
subject_id_lable_encoder = label_encoder
else:
label_encoder.fit(all_user_item_feats_df[[feat]])
if feat == 'user_id':
user_id_label_encoder = label_encoder
pickle.dump(label_encoder, open(subject_model_save_path + feat + '_label_encoder.model', 'wb'))
train_user_item_feats_df[feat] = label_encoder.transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = label_encoder.transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = label_encoder.transform(test_user_item_feats_df[[feat]])
# 准备训练数据
x_train, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns(
train_user_item_feats_df, dense_fea, sparse_fea)
y_train = train_user_item_feats_df['label'].values
if val_user_item_feats_df is not None:
# 准备验证数据
x_val, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns(
val_user_item_feats_df, dense_fea, sparse_fea)
y_val = val_user_item_feats_df['label'].values
dense_fea = [x for x in dense_fea if x != 'label']
x_test, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns(
test_user_item_feats_df, dense_fea, sparse_fea)
model = DIFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.summary()
model.compile(optimizer="Adam",
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['AUC']#评价指标AUC,精确度,均方误差
)
# 模型训练
if val_user_item_feats_df is not None:
history = model.fit(x_train,
y_train,
verbose=1,
epochs=5,
validation_data=(x_val, y_val),
batch_size=256)
else:
# 也可以使用下面的语句用自己采样出来的验证集
history = model.fit(x_train,
y_train,
verbose=1,
epochs=5,
validation_split=0.3,
batch_size=256)
# 保存训练好的DIFM模型
save_model(model, subject_model_save_path + 'difm_model.h5')
# 模型预测
test_user_item_feats_df['pred_score'] = model.predict(x_test, verbose=1, batch_size=256)
# 还原user_id和subject_id
test_user_item_feats_df['user_id'] = user_id_label_encoder.inverse_transform(test_user_item_feats_df[['user_id']])
test_user_item_feats_df['subject_id'] = subject_id_lable_encoder.inverse_transform(test_user_item_feats_df[['subject_id']])
item_info_df = get_item_info_df()
item_info_df = item_info_df[['subject_id', 'subject_name']]
test_user_item_feats_df = test_user_item_feats_df.merge(item_info_df, how='left', on='subject_id')
test_user_item_feats_df.sort_values(by=['user_id', 'pred_score'], ascending=[True, False], inplace=True)
test_user_item_feats_df['pred_rank'] = test_user_item_feats_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first').astype(int)
for feat in subject_rank_feats_columns:
min_max_scaler = pickle.load(open(subject_model_save_path + 'min_max_scaler_' + feat + '.model', 'rb'))
train_user_item_feats_df[feat] = min_max_scaler.inverse_transform(train_user_item_feats_df[[feat]])
if val_user_item_feats_df is not None:
val_user_item_feats_df[feat] = min_max_scaler.inverse_transform(val_user_item_feats_df[[feat]])
test_user_item_feats_df[feat] = min_max_scaler.inverse_transform(test_user_item_feats_df[[feat]])
test_user_item_feats_df[['user_id', 'subject_id', 'subject_name'] + subject_rank_feats_columns + ['pred_score', 'pred_rank']].to_csv(
subject_features_save_path + 'difm_rank_score.csv', sep='\t', index=False, header=True)