import os import sys sys.path.append(os.getcwd()) import pickle import pandas as pd from tqdm import tqdm from sklearn.preprocessing import LabelEncoder, MinMaxScaler import warnings from deepctr.models import DIFM from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names from tensorflow.keras import backend as K from tensorflow.keras.models import save_model import tensorflow as tf from tensorflow.compat.v1 import ConfigProto from tensorflow.compat.v1 import InteractiveSession from config import offline_mode, shixun_features_save_path from config import shixun_train_user_item_feats from config import shixun_val_user_item_feats from config import shixun_test_user_item_feats from config import shixun_model_save_path from config import shixun_all_user_item_feats from config import shixun_rank_dense_fea from config import shixun_rank_sparse_fea from config import shixun_rank_feats_columns from config import logger from matching.shixun.recall_comm import get_item_info_df os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "1" K.set_learning_phase(True) if tf.__version__ >= '2.0.0': tf.compat.v1.disable_eager_execution() tqdm.pandas() warnings.filterwarnings('ignore') # 加这几行避免训练报错 configs = ConfigProto() configs.gpu_options.allow_growth = True session = InteractiveSession(config=configs) def get_difm_feats_columns(df, dense_fea, sparse_fea, emb_dim=32): """ 数据准备函数: df: 数据集 dense_fea: 数值型特征列 sparse_fea: 离散型特征列 """ fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].max() + 1, embedding_dim=emb_dim) for i, feat in enumerate(sparse_fea)] + [DenseFeat(feat, 1, ) for feat in dense_fea] dnn_feature_columns = fixlen_feature_columns linear_feature_columns = fixlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) x = {} x = {name: df[name].values for name in feature_names} return x, linear_feature_columns, dnn_feature_columns if __name__ == '__main__': logger.info('获取物品信息') shixun_info_df = get_item_info_df() # 课程总数量 shixuns_count = len(shixun_info_df['shixun_id'].unique().tolist()) logger.info('加载用户行为特征') # 所有用户物品特征 all_user_item_feats_df = pd.read_csv(shixun_all_user_item_feats, sep='\t', encoding='utf-8') train_user_item_feats_df = pd.read_csv(shixun_train_user_item_feats, sep='\t', encoding='utf-8') train_user_item_feats_df['shixun_id'] = train_user_item_feats_df['shixun_id'].astype(int) if offline_mode: val_user_item_feats_df = pd.read_csv(shixun_val_user_item_feats, sep='\t', encoding='utf-8') val_user_item_feats_df['shixun_id'] = val_user_item_feats_df['shixun_id'].astype(int) else: val_user_item_feats_df = None test_user_item_feats_df = pd.read_csv(shixun_test_user_item_feats, sep='\t', encoding='utf-8') test_user_item_feats_df['shixun_id'] = test_user_item_feats_df['shixun_id'].astype(int) # 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉就行 del test_user_item_feats_df['label'] # 把特征分开 sparse_fea = shixun_rank_sparse_fea dense_fea = shixun_rank_dense_fea train_user_item_feats_df[dense_fea] = train_user_item_feats_df[dense_fea].fillna(0, ) if val_user_item_feats_df is not None: val_user_item_feats_df[dense_fea] = val_user_item_feats_df[dense_fea].fillna(0, ) test_user_item_feats_df[dense_fea] = test_user_item_feats_df[dense_fea].fillna(0, ) # dense特征进行归一化 for feat in dense_fea: min_max_scaler = MinMaxScaler() min_max_scaler.fit((all_user_item_feats_df[[feat]])) pickle.dump(min_max_scaler, open(shixun_model_save_path + 'min_max_scaler_' + feat + '.model', 'wb')) train_user_item_feats_df[feat] = min_max_scaler.transform(train_user_item_feats_df[[feat]]) if val_user_item_feats_df is not None: val_user_item_feats_df[feat] = min_max_scaler.transform(val_user_item_feats_df[[feat]]) test_user_item_feats_df[feat] = min_max_scaler.transform(test_user_item_feats_df[[feat]]) # sparse特征进行LabelEncoder for feat in sparse_fea: label_encoder = LabelEncoder() if feat == 'shixun_id': label_encoder.fit(shixun_info_df[[feat]]) shixun_id_lable_encoder = label_encoder else: label_encoder.fit(all_user_item_feats_df[[feat]]) if feat == 'user_id': user_id_label_encoder = label_encoder pickle.dump(label_encoder, open(shixun_model_save_path + feat + '_label_encoder.model', 'wb')) train_user_item_feats_df[feat] = label_encoder.transform(train_user_item_feats_df[[feat]]) if val_user_item_feats_df is not None: val_user_item_feats_df[feat] = label_encoder.transform(val_user_item_feats_df[[feat]]) test_user_item_feats_df[feat] = label_encoder.transform(test_user_item_feats_df[[feat]]) # 准备训练数据 x_train, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns( train_user_item_feats_df, dense_fea, sparse_fea) y_train = train_user_item_feats_df['label'].values if val_user_item_feats_df is not None: # 准备验证数据 x_val, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns( val_user_item_feats_df, dense_fea, sparse_fea) y_val = val_user_item_feats_df['label'].values dense_fea = [x for x in dense_fea if x != 'label'] x_test, linear_feature_columns, dnn_feature_columns = get_difm_feats_columns( test_user_item_feats_df, dense_fea, sparse_fea) model = DIFM(linear_feature_columns, dnn_feature_columns, task='binary') model.summary() model.compile(optimizer="Adam", loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['AUC']#评价指标:AUC,精确度,均方误差 ) # 模型训练 if val_user_item_feats_df is None:#is not None history = model.fit(x_train, y_train, verbose=1, epochs=5, validation_data=(x_val, y_val), batch_size=256) else: # 也可以使用下面的语句用自己采样出来的验证集 history = model.fit(x_train, y_train, verbose=1, epochs=5, validation_split=0.2, batch_size=256) # 保存训练好的difm模型 save_model(model, shixun_model_save_path + 'difm_model.h5') # 模型预测 test_user_item_feats_df['pred_score'] = model.predict(x_test, verbose=1, batch_size=256) # 还原user_id和shixun_id test_user_item_feats_df['user_id'] = user_id_label_encoder.inverse_transform(test_user_item_feats_df[['user_id']]) test_user_item_feats_df['shixun_id'] = shixun_id_lable_encoder.inverse_transform(test_user_item_feats_df[['shixun_id']]) item_info_df = get_item_info_df() item_info_df = item_info_df[['shixun_id', 'shixun_name']] test_user_item_feats_df = test_user_item_feats_df.merge(item_info_df, how='left', on='shixun_id') test_user_item_feats_df.sort_values(by=['user_id', 'pred_score'], ascending=[True, False], inplace=True) test_user_item_feats_df['pred_rank'] = test_user_item_feats_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first').astype(int) for feat in shixun_rank_feats_columns: min_max_scaler = pickle.load(open(shixun_model_save_path + 'min_max_scaler_' + feat + '.model', 'rb')) train_user_item_feats_df[feat] = min_max_scaler.inverse_transform(train_user_item_feats_df[[feat]]) if val_user_item_feats_df is not None: val_user_item_feats_df[feat] = min_max_scaler.inverse_transform(val_user_item_feats_df[[feat]]) test_user_item_feats_df[feat] = min_max_scaler.inverse_transform(test_user_item_feats_df[[feat]]) test_user_item_feats_df[['user_id', 'shixun_id', 'shixun_name'] + shixun_rank_feats_columns + ['pred_score', 'pred_rank']].to_csv( shixun_features_save_path + 'difm_rank_score.csv', sep='\t', index=False, header=True)