|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import pickle
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
import warnings
|
|
|
|
|
from utils import reduce_mem
|
|
|
|
|
|
|
|
|
|
from config import logger
|
|
|
|
|
from config import samples_mode
|
|
|
|
|
from config import shixun_features_save_path
|
|
|
|
|
from config import myshixuns_train_data, myshixuns_test_data
|
|
|
|
|
from config import shixun_itemcf_recall_dict
|
|
|
|
|
from config import shixun_item_embedding_recall_dict
|
|
|
|
|
from config import shixun_youtubednn_recall_dict
|
|
|
|
|
from config import shixun_youtubednn_usercf_recall_dict
|
|
|
|
|
from config import shixun_dssm_recall_dict
|
|
|
|
|
from config import shixun_pinsage_recall_dict
|
|
|
|
|
from config import shixun_final_recall_items_dict
|
|
|
|
|
# from config import shixuns_bert_emb_dict
|
|
|
|
|
from config import shixun_item_w2v_emb_dict
|
|
|
|
|
from config import shixun_train_user_item_feats
|
|
|
|
|
from config import shixun_val_user_item_feats
|
|
|
|
|
from config import shixun_test_user_item_feats
|
|
|
|
|
from config import shixun_youtube_user_emb_dict
|
|
|
|
|
from config import shixun_youtube_item_emb_dict
|
|
|
|
|
from config import shixun_all_user_item_feats
|
|
|
|
|
from config import shixun_dssm_item_emb_dict
|
|
|
|
|
from config import shixun_dssm_user_emb_dict
|
|
|
|
|
from config import offline_mode
|
|
|
|
|
|
|
|
|
|
from matching.shixun.recall_comm import get_item_info_df
|
|
|
|
|
from matching.shixun.recall_comm import get_hist_and_last_select
|
|
|
|
|
from matching.shixun.recall_comm import get_user_info_df
|
|
|
|
|
from ranking.shixun.rank_comm import fill_is_trainee_hab
|
|
|
|
|
from ranking.shixun.rank_comm import get_rank_item_info_dict
|
|
|
|
|
# from ranking.shixun.rank_comm import get_item_bert_emb_dict
|
|
|
|
|
|
|
|
|
|
tqdm.pandas()
|
|
|
|
|
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
def train_val_split(all_select_df, sample_rate=0.3):
    """
    Split the interaction log into a training part and a validation part by user.

    A random ``sample_rate`` fraction of the distinct users is drawn (without
    replacement) as validation users. For each of them the most recent
    interaction, ordered by ``created_timestamp``, is held out as the answer
    and the earlier interactions are kept as that user's history.

    :param all_select_df: the training log; must contain ``user_id`` and
        ``created_timestamp`` columns
    :param sample_rate: fraction of distinct users sampled as validation users
    :return: ``(select_train, select_val, val_ans)`` where ``select_train``
        holds the rows of non-sampled users, ``select_val`` the history rows
        of validation users (index reset) and ``val_ans`` their held-out
        last rows
    """
    all_user_ids = all_select_df['user_id'].unique()

    # replace=False: a user can be drawn at most once
    sample_user_ids = np.random.choice(
        all_user_ids, size=int(len(all_user_ids) * sample_rate), replace=False)

    is_val_user = all_select_df['user_id'].isin(sample_user_ids)
    select_train = all_select_df[~is_val_user]
    select_val = all_select_df[is_val_user].sort_values(['user_id', 'created_timestamp'])

    # Position of every row counted from the end of its user's sorted
    # history (0 == most recent). This replaces a per-group Python apply and
    # also works when no user was sampled at all.
    rank_from_end = select_val.groupby('user_id').cumcount(ascending=False)
    val_ans = select_val[rank_from_end == 0]
    select_val = select_val[rank_from_end > 0].reset_index(drop=True)

    # A user with a single interaction has no history left once the answer is
    # removed; keep only users that appear in both frames.
    val_ans = val_ans[val_ans['user_id'].isin(select_val['user_id'].unique())]
    select_val = select_val[select_val['user_id'].isin(val_ans['user_id'].unique())]

    return select_train, select_val, val_ans
|
|
|
|
|
|
|
|
|
|
def get_train_val_test_data(offline=True):
    """
    Load the training, validation and test interaction logs.

    :param offline: when true, the training log is split by user into a
        training part and a validation part via ``train_val_split``; when
        false, the whole training log is used and no validation data exists
    :return: ``(select_train, select_val, select_test, val_ans)``;
        ``select_val`` and ``val_ans`` are ``None`` when ``offline`` is false
    """
    def _read_log(path):
        # both logs are tab-separated UTF-8 files
        return pd.read_csv(path, sep='\t', encoding='utf-8')

    train_log = _read_log(myshixuns_train_data)
    if offline:
        select_train, select_val, val_ans = train_val_split(train_log, sample_rate=0.3)
    else:
        select_train, select_val, val_ans = train_log, None, None

    select_test = _read_log(myshixuns_test_data)
    return select_train, select_val, select_test, val_ans
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_recall_list(single_recall_model=None, multi_recall=False):
    """
    Load a pickled recall result: either the merged multi-channel list or the
    list of one single recall channel.

    :param single_recall_model: name of the single recall channel to load;
        one of ``'i2i_itemcf'``, ``'i2i_emb_itemcf'``, ``'user_cf'``,
        ``'youtubednn'``, ``'dssm'``, ``'pinsage'``. Ignored when
        ``multi_recall`` is true.
    :param multi_recall: when true, load the merged multi-channel result
    :return: the unpickled object, or ``None`` when ``multi_recall`` is false
        and ``single_recall_model`` is not a known channel name
    """
    if multi_recall:
        recall_path = shixun_final_recall_items_dict
    else:
        single_model_paths = {
            'i2i_itemcf': shixun_itemcf_recall_dict,
            'i2i_emb_itemcf': shixun_item_embedding_recall_dict,
            'user_cf': shixun_youtubednn_usercf_recall_dict,
            'youtubednn': shixun_youtubednn_recall_dict,
            'dssm': shixun_dssm_recall_dict,
            'pinsage': shixun_pinsage_recall_dict,
        }
        recall_path = single_model_paths.get(single_recall_model)
        if recall_path is None:
            # unknown channel name: nothing to load (same result as before)
            return None

    # NOTE(review): pickle must only be used on trusted, locally produced files.
    # The context manager closes the handle that used to be leaked.
    with open(recall_path, 'rb') as recall_file:
        return pickle.load(recall_file)
|
|
|
|
|
|
|
|
|
|
def get_embedding():
    """
    Load the pickled embedding dictionaries used by the ranking features.

    The files are expected to have been produced beforehand (e.g. the w2v
    item embedding must already be trained). A missing file is reported on
    stdout and yields an empty dict for that slot, so the function no longer
    fails with ``UnboundLocalError`` at the return statement.

    Loading of the BERT item embedding is currently disabled in this module.

    :return: ``(item_word2vec_emb_dict, item_youtube_emb_dict,
        user_youtube_emb_dict, item_dssm_emb_dict, user_dssm_emb_dict)``
    """
    item_word2vec_emb_dict = _load_emb_dict(shixun_item_w2v_emb_dict)
    item_youtube_emb_dict = _load_emb_dict(shixun_youtube_item_emb_dict)
    user_youtube_emb_dict = _load_emb_dict(shixun_youtube_user_emb_dict)
    item_dssm_emb_dict = _load_emb_dict(shixun_dssm_item_emb_dict)
    user_dssm_emb_dict = _load_emb_dict(shixun_dssm_user_emb_dict)

    return (item_word2vec_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict,
            item_dssm_emb_dict, user_dssm_emb_dict)


def _load_emb_dict(emb_path):
    """Unpickle one embedding file; report and return ``{}`` if it is missing."""
    if not os.path.exists(emb_path):
        print(os.path.basename(emb_path) + ' file not exist.')
        return {}
    # NOTE(review): pickle must only be used on trusted, locally produced files.
    with open(emb_path, 'rb') as emb_file:
        return pickle.load(emb_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recall_dict_2_df(recall_list_dict):
    """
    Flatten a recall dictionary into a three-column DataFrame.

    :param recall_list_dict: mapping ``user -> iterable of (item, score)``
    :return: DataFrame with columns ``user_id``, ``sim_item``, ``score``,
        one row per (user, recalled item) pair
    """
    flat_rows = [
        [uid, candidate, sim]
        for uid, candidates in tqdm(recall_list_dict.items())
        for candidate, sim in candidates
    ]
    return pd.DataFrame(flat_rows, columns=['user_id', 'sim_item', 'score'])
|
|
|
|
|
|
|
|
|
|
def neg_sample_recall_data(recall_items_df, sample_rate=0.05):
    """
    Down-sample the negative rows of a labelled recall table.

    Negatives (``label == 0``) are sampled twice, once grouped by
    ``user_id`` and once grouped by ``sim_item``, so that every user and
    every item that has negatives keeps at least one of them. The two
    samples are merged, de-duplicated on ``(user_id, sim_item)`` and
    concatenated with all positive rows (``label == 1``).

    :param recall_items_df: DataFrame with ``user_id``, ``sim_item``,
        ``score`` and ``label`` columns
    :param sample_rate: fraction of each group's negatives to draw; the
        per-group sample size is clamped to the range [1, 20]
    :return: DataFrame of all positives plus the sampled negatives, with a
        fresh integer index
    """
    def _print_balance(pos_cnt, neg_cnt, ndigits):
        total = pos_cnt + neg_cnt
        # an empty table used to raise ZeroDivisionError here
        ratio = round(pos_cnt / total, ndigits) if total else 0.0
        print('正样本数量:', pos_cnt, '负样本数量:', neg_cnt, '正样本比率:', ratio)

    def neg_sample_func(group_df):
        # at least one row per group, at most 20 (tune the cap as needed)
        sample_num = min(max(int(len(group_df) * sample_rate), 1), 20)
        # Drawn with replacement; repeated rows are removed by the
        # de-duplication step below.
        return group_df.sample(n=sample_num, replace=True)

    logger.info('采样之前数据')

    pos_data = recall_items_df[recall_items_df['label'] == 1]
    neg_data = recall_items_df[recall_items_df['label'] == 0]
    _print_balance(len(pos_data), len(neg_data), 6)

    if neg_data.empty:
        # nothing to sample; skip the group-wise apply on an empty frame
        neg_data_new = neg_data
    else:
        # per-user sampling keeps every user in the result
        neg_data_user_sample = neg_data.groupby('user_id', group_keys=False). \
            progress_apply(neg_sample_func)
        # per-item sampling keeps every item in the result
        neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False). \
            progress_apply(neg_sample_func)

        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
        neg_data_new = pd.concat([neg_data_user_sample, neg_data_item_sample])

        # the two passes are independent and may pick the same pair twice
        neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(
            ['user_id', 'sim_item'], keep='last')

    data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)

    logger.info('采样之后数据')
    _print_balance(int((data_new['label'] == 1).sum()),
                   int((data_new['label'] == 0).sum()), 4)

    return data_new
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sample_test_recall_data(recall_items_df, sample_rate=0.05):
    """
    Randomly down-sample the recall table of the test users.

    Rows are sampled twice, once grouped by ``user_id`` and once grouped by
    ``sim_item``, so that every user and every item keeps at least one row.
    The two samples are merged and de-duplicated on ``(user_id, sim_item)``.

    :param recall_items_df: DataFrame with ``user_id``, ``sim_item`` and
        ``score`` columns
    :param sample_rate: fraction of each group's rows to draw; the per-group
        sample size is clamped to the range [1, 20]
    :return: the sampled DataFrame (original index labels are kept)
    """
    logger.info('采样之前样本数量:' + str(len(recall_items_df)))

    def neg_sample_func(group_df):
        # at least one row per group, at most 20 (tune the cap as needed)
        sample_num = min(max(int(len(group_df) * sample_rate), 1), 20)
        # Drawn with replacement; repeated rows are removed by the
        # de-duplication step below.
        return group_df.sample(n=sample_num, replace=True)

    if recall_items_df.empty:
        # nothing to sample; skip the group-wise apply on an empty frame
        data_new = recall_items_df
    else:
        # per-user sampling keeps every user in the result
        data_user_sample = recall_items_df.groupby('user_id', group_keys=False). \
            progress_apply(neg_sample_func)
        # per-item sampling keeps every item in the result
        data_item_sample = recall_items_df.groupby('sim_item', group_keys=False). \
            progress_apply(neg_sample_func)

        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
        data_new = pd.concat([data_user_sample, data_item_sample])

        # the two passes are independent and may pick the same pair twice
        data_new = data_new.sort_values(['user_id', 'score']).drop_duplicates(
            ['user_id', 'sim_item'], keep='last')

    logger.info('采样之后样本数量:' + str(len(data_new)))

    return data_new
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_rank_label_df(recall_list_df, label_df, is_test=False):
    """
    Attach a ``label`` column to a recall table.

    A recalled ``(user_id, sim_item)`` pair is labelled 1 when the same pair
    occurs in ``label_df`` (with a non-null ``created_timestamp``) and 0
    otherwise. The test set has no ground truth, so every row gets -1 there
    to keep the downstream code uniform.

    :param recall_list_df: DataFrame with ``user_id`` and ``sim_item``
        columns; it is not modified
    :param label_df: DataFrame with ``user_id``, ``shixun_id`` and
        ``created_timestamp`` columns; ignored when ``is_test`` is true
    :param is_test: label every row with -1 instead of matching ``label_df``
    :return: a new DataFrame equal to ``recall_list_df`` plus ``label``
    """
    if is_test:
        # assign() returns a new frame, so the caller's (possibly sliced)
        # DataFrame is no longer written to
        return recall_list_df.assign(label=-1)

    truth = label_df.rename(columns={'shixun_id': 'sim_item'})
    # one row per pair, so the left merge cannot multiply candidate rows
    truth = truth[['user_id', 'sim_item', 'created_timestamp']].drop_duplicates(
        ['user_id', 'sim_item'])

    labelled = recall_list_df.merge(truth, how='left', on=['user_id', 'sim_item'])

    # a timestamp is present exactly for the pairs found in the truth table;
    # notna() also handles non-float timestamp dtypes
    labelled['label'] = labelled['created_timestamp'].notna().astype(int)

    return labelled.drop(columns='created_timestamp')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_recall_item_label_df(select_train_hist,
                                  select_val_hist,
                                  select_test_hist,
                                  select_train_last,
                                  select_val_last,
                                  recall_list_df):
    """
    Label and sample the recall lists of the training, validation and test users.

    Training and validation candidates are labelled against the users' last
    selections and then negative-sampled; test candidates get the test
    placeholder label and are randomly sampled to cut feature-building time.

    :param select_train_hist: history log of the training users
    :param select_val_hist: history log of the validation users, or ``None``
    :param select_test_hist: history log of the test users
    :param select_train_last: last selections of the training users
    :param select_val_last: last selections of the validation users
    :param recall_list_df: recall table with a ``user_id`` column
    :return: ``(train_df, val_df, test_df)``; ``val_df`` is ``None`` when
        ``select_val_hist`` is ``None``
    """
    def _candidates_of(hist_df):
        # recall rows belonging to the users present in hist_df
        wanted_users = hist_df['user_id'].unique()
        return recall_list_df[recall_list_df['user_id'].isin(wanted_users)]

    # --- training users ---
    train_candidates = _candidates_of(select_train_hist)
    logger.info('训练集数据打标签')
    train_labelled = get_rank_label_df(train_candidates, select_train_last, is_test=False)
    logger.info('训练集数据负采样')
    train_user_item_label_df = neg_sample_recall_data(train_labelled)

    # --- validation users (offline mode only) ---
    val_user_item_label_df = None
    if select_val_hist is not None:
        val_candidates = _candidates_of(select_val_hist)
        logger.info('验证集数据打标签')
        val_labelled = get_rank_label_df(val_candidates, select_val_last, is_test=False)
        logger.info('验证集数据负采样')
        val_user_item_label_df = neg_sample_recall_data(val_labelled)

    # --- test users ---
    test_candidates = _candidates_of(select_test_hist)
    logger.info('测试集数据打标签')
    test_labelled = get_rank_label_df(test_candidates, None, is_test=True)
    logger.info('测试集数据随机采样')
    test_user_item_label_df = sample_test_recall_data(test_labelled)

    return train_user_item_label_df, val_user_item_label_df, test_user_item_label_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_tuple_func(group_df):
    """
    Turn the rows of a recall DataFrame (typically one user's group) into a
    list of ``(sim_item, score, label)`` tuples for ranking-feature building.

    :param group_df: DataFrame with ``sim_item``, ``score`` and ``label`` columns
    :return: list with one tuple per row, in row order
    """
    return [
        (record['sim_item'], record['score'], record['label'])
        for _, record in group_df.iterrows()
    ]
|
|
|
|
|
|
|
|
|
|
def get_cos_similar_matrix(v1, v2):
    """
    Return the cosine similarity of two vectors rescaled from [-1, 1] to [0, 1].

    The result is ``0.5 + 0.5 * cos(v1, v2)``. When either vector has zero
    norm the cosine is undefined; it is treated as 0, so the function
    returns the neutral value 0.5 instead of NaN.

    :param v1: first vector (array-like; flattened before use)
    :param v2: second vector (array-like of the same size)
    :return: Python ``float`` in [0, 1]
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    # product of the two Euclidean norms
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 0.5

    # clip away floating-point overshoot such as 1.0000000002
    cos = float(np.clip(np.dot(a, b) / denom, -1.0, 1.0))
    return 0.5 + 0.5 * cos
|
|
|
|
|
|
|
|
|
|
def create_behavior_feature(users_id,
                            recall_list,
                            select_hist_df,
                            shixuns_info,
                            shixun_info_dict,
                            shixuns_emb,
                            user_emb=None,
                            N=1):
    """
    Build user-history x candidate cross features for the ranking model.

    Every candidate in a user's recall list is compared with each of the
    user's last ``N`` selected items: embedding similarity plus absolute
    differences of the item attributes. When fewer than ``N`` usable history
    items exist, the missing per-history slots and the similarity statistics
    are filled with NaN so that every row lines up with the column list
    (previously short rows were right-padded by pandas and score/rank/label
    ended up in the wrong columns).

    :param users_id: iterable of user ids to build rows for
    :param recall_list: mapping ``user_id -> [(shixun_id, score, label), ...]``
    :param select_hist_df: history log with ``user_id`` and ``shixun_id``
        columns; the last rows of a user are taken as the most recent ones
        (assumes the log is in chronological order - TODO confirm with caller)
    :param shixuns_info: item table; only its ``shixun_id`` column is read
    :param shixun_info_dict: mapping ``shixun_id -> [record]``; positions
        0..6 of ``record`` are read as creation time, difficulty (trainee),
        visit count, selection count, challenge count, average star and
        pass count
    :param shixuns_emb: mapping ``shixun_id -> embedding vector``
    :param user_emb: optional mapping ``user_id -> embedding vector``; when
        given, it must share the dimension of ``shixuns_emb``
    :param N: number of most recent history items to compare against;
        defaults to 1 because many users have a single history record
    :return: DataFrame with one row per (user, candidate) pair
    """
    nan = float('nan')

    # Set membership is O(1) per candidate instead of a list scan. The input
    # frames are not modified any more.
    known_item_ids = set(shixuns_info['shixun_id'].astype(int).tolist())

    # Last N history items per user, computed once instead of filtering the
    # whole log for every user.
    hist_by_user = {}
    if N > 0:
        user_keys = select_hist_df['user_id'].astype(int)
        for uid, items in select_hist_df['shixun_id'].groupby(user_keys, sort=False):
            hist_by_user[uid] = items.tolist()[-N:]

    # Record positions of the attribute-difference features, in output
    # column order: time, visits, myshixuns, challenges, trainee, star, pass.
    diff_fields = (0, 2, 3, 4, 1, 5, 6)

    # two-dimensional list of feature rows, converted to a DataFrame below
    all_user_feas = []

    for user_id in tqdm(users_id):
        # history items missing from the item table are skipped
        hist_items = [int(item) for item in hist_by_user.get(user_id, [])
                      if item in known_item_ids]

        for rank, (shixun_id, score, label) in enumerate(recall_list[user_id]):
            # candidates missing from the item table are skipped
            if shixun_id not in known_item_ids:
                continue

            shixun_id = int(shixun_id)
            cand_info = shixun_info_dict[shixun_id][0]
            cand_has_emb = shixun_id in shixuns_emb

            sim_fea = []
            diff_feas = [[] for _ in diff_fields]

            for hist_item in hist_items:
                hist_info = shixun_info_dict[hist_item][0]

                if cand_has_emb and hist_item in shixuns_emb:
                    # rescaled cosine similarity of the two item embeddings
                    sim_fea.append(get_cos_similar_matrix(
                        shixuns_emb[hist_item], shixuns_emb[shixun_id]))
                else:
                    sim_fea.append(0.0)

                for bucket, pos in zip(diff_feas, diff_fields):
                    bucket.append(abs(cand_info[pos] - hist_info[pos]))

            # fill the slots of history items the user does not have
            pad = [nan] * (N - len(sim_fea))

            single_user_fea = [user_id, shixun_id]
            single_user_fea.extend(sim_fea + pad)
            for bucket in diff_feas:
                single_user_fea.extend(bucket + pad)

            # statistics over the similarities: max, min, sum, mean
            if sim_fea:
                single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea),
                                        sum(sim_fea) / len(sim_fea)])
            else:
                single_user_fea.extend([nan] * 4)

            if user_emb:
                # similarity between the user vector and the candidate vector
                if (user_id not in user_emb) or (not cand_has_emb):
                    single_user_fea.append(0.0)
                else:
                    single_user_fea.append(np.dot(user_emb[user_id], shixuns_emb[shixun_id]))

            single_user_fea.extend([score, rank, label])
            all_user_feas.append(single_user_fea)

    # column layout, matching the order in which each row was assembled
    id_cols = ['user_id', 'shixun_id']
    per_hist_prefixes = ['sim', 'time_diff', 'visit_diff', 'myshixuns_diff',
                         'challenges_diff', 'trainee_diff', 'averge_star_diff',
                         'task_pass_diff']
    per_hist_cols = [prefix + str(i) for prefix in per_hist_prefixes for i in range(N)]
    sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']
    user_item_sim_cols = ['user_item_sim'] if user_emb else []
    user_score_rank_label = ['score', 'rank', 'label']

    cols = id_cols + per_hist_cols + sat_cols + user_item_sim_cols + user_score_rank_label

    features_df = pd.DataFrame(all_user_feas, columns=cols)

    return features_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def active_level(all_data, cols):
    """
    Build the user activity feature.

    A user who selects many items with short gaps between selections is
    considered active:

    1. group by ``user_id`` and compute the number of selections and the
       mean gap between consecutive selection timestamps;
    2. take the reciprocal of the selection count, min-max normalise both
       quantities and add them; the smaller the sum, the more active the user.

    A user with a single selection has no gap; the value 1 is used for that
    case (NOTE(review): the original note says this should act as a "large"
    value, which only holds if timestamps are already scaled to [0, 1] -
    confirm against the data).

    The result is cached in ``user_active_level.csv`` under the feature
    directory and read back from there when the file already exists.

    :param all_data: interaction log
    :param cols: columns to use; must include ``user_id``, ``shixun_id`` and
        ``created_timestamp``
    :return: DataFrame with columns ``user_id``, ``select_size``,
        ``user_time_diff_mean``, ``active_level``
    """
    cache_path = shixun_features_save_path + 'user_active_level.csv'
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, sep='\t', encoding='utf-8')

    # sort a new frame instead of sorting a column selection in place
    data = all_data[cols].sort_values(['user_id', 'created_timestamp'])

    # named aggregation gives flat, explicitly named columns
    user_act = data.groupby('user_id', as_index=False).agg(
        select_size=('shixun_id', 'size'),
        created_timestamp=('created_timestamp', list))

    def time_diff_mean(timestamps):
        # mean gap between consecutive (sorted) timestamps
        if len(timestamps) == 1:
            return 1
        return np.mean(np.diff(timestamps))

    def min_max(series):
        span = series.max() - series.min()
        if span == 0:
            # constant column: avoid 0/0 and map everything to 0
            return series * 0.0
        return (series - series.min()) / span

    user_act['user_time_diff_mean'] = user_act['created_timestamp'].progress_apply(time_diff_mean)

    # reciprocal of the selection count, then normalise both quantities
    user_act['select_size'] = min_max(1 / user_act['select_size'])
    user_act['user_time_diff_mean'] = min_max(user_act['user_time_diff_mean'])

    user_act['active_level'] = user_act['select_size'] + user_act['user_time_diff_mean']

    user_act['user_id'] = user_act['user_id'].astype('int')
    user_act = user_act.drop(columns='created_timestamp')

    user_act.to_csv(cache_path, index=False, header=True, sep='\t')

    return user_act
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def hot_level(all_data, cols):
    """
    Build the item popularity ("hotness") feature.

    An item that is selected many times within short time gaps is
    considered hot:

    1. group by ``shixun_id`` and compute the number of selections and the
       mean gap between consecutive selection timestamps;
    2. take the reciprocal of the count, min-max normalise both quantities
       and add them; the smaller the sum, the hotter the item.

    An item with a single selection has no gap; the value 1 is used for
    that case.

    The result is cached in ``shixun_hot_level.csv`` under the feature
    directory and read back from there when the file already exists.

    :param all_data: interaction log
    :param cols: columns to use; must include ``shixun_id``, ``user_id`` and
        ``created_timestamp``
    :return: DataFrame with columns ``shixun_id``, ``user_num``,
        ``item_time_diff_mean``, ``hot_level``
    """
    cache_path = shixun_features_save_path + 'shixun_hot_level.csv'
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, sep='\t', encoding='utf-8')

    # sort a new frame instead of sorting a column selection in place
    data = all_data[cols].sort_values(['shixun_id', 'created_timestamp'])

    # named aggregation gives flat, explicitly named columns
    shixun_hot = data.groupby('shixun_id', as_index=False).agg(
        user_num=('user_id', 'size'),
        created_timestamp=('created_timestamp', list))

    def time_diff_mean(timestamps):
        # mean gap between consecutive (sorted) timestamps
        if len(timestamps) == 1:
            return 1
        return np.mean(np.diff(timestamps))

    def min_max(series):
        span = series.max() - series.min()
        if span == 0:
            # constant column: avoid 0/0 and map everything to 0
            return series * 0.0
        return (series - series.min()) / span

    shixun_hot['item_time_diff_mean'] = shixun_hot['created_timestamp']. \
        progress_apply(time_diff_mean)

    # reciprocal of the selection count, then normalise both quantities
    shixun_hot['user_num'] = min_max(1 / shixun_hot['user_num'])
    shixun_hot['item_time_diff_mean'] = min_max(shixun_hot['item_time_diff_mean'])

    shixun_hot['hot_level'] = shixun_hot['user_num'] + shixun_hot['item_time_diff_mean']

    shixun_hot['shixun_id'] = shixun_hot['shixun_id'].astype('int')
    shixun_hot = shixun_hot.drop(columns='created_timestamp')

    shixun_hot.to_csv(cache_path, index=False, header=True, sep='\t')

    return shixun_hot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def user_time_hob_fea(all_data, cols):
    """
    Build the user time-habit features.

    ``created_timestamp`` and ``created_at_ts`` are min-max scaled over the
    whole table and then averaged per user.

    :param all_data: interaction log joined with item information
    :param cols: columns to use; must include ``user_id``,
        ``created_timestamp`` and ``created_at_ts``
    :return: DataFrame with one row per user; the two averaged columns are
        named ``user_time_hob1`` and ``user_time_hob2``
    """
    # explicit copy: the columns below are overwritten, and this must not be
    # a write into a selection of the caller's frame
    user_time_hob_info = all_data[cols].copy()

    # normalise both timestamp columns first (the scaler is refit per column)
    mm = MinMaxScaler()
    for ts_col in ('created_timestamp', 'created_at_ts'):
        # fit_transform returns an (n, 1) array; flatten it to a 1-D column
        user_time_hob_info[ts_col] = mm.fit_transform(user_time_hob_info[[ts_col]]).ravel()

    user_time_hob_info = user_time_hob_info.groupby('user_id').agg('mean').reset_index()

    return user_time_hob_info.rename(columns={'created_timestamp': 'user_time_hob1',
                                              'created_at_ts': 'user_time_hob2'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def user_trainee_hob_fea(all_data, cols):
    """
    Collect, per user, the set of difficulty levels of the selected items.

    The ``trainee`` (difficulty) values are converted to strings and gathered
    into one set per user. A later step can derive a 0/1 feature telling
    whether a candidate's difficulty is in that set.

    :param all_data: interaction log joined with item information; it is
        not modified
    :param cols: columns to use; must include ``user_id`` and ``trainee``
    :return: DataFrame with columns ``user_id`` and ``trainee_list`` (a set
        of strings), one row per user
    """
    trainee_by_row = all_data[cols][['user_id', 'trainee']]
    # assign() builds a new frame instead of writing into a selection
    trainee_by_row = trainee_by_row.assign(trainee=trainee_by_row['trainee'].astype(str))

    # Aggregating the single column keeps the result flat, with no
    # MultiIndex columns to unpack.
    user_trainee_hob_info = (
        trainee_by_row.groupby('user_id')['trainee']
        .agg(set)
        .reset_index()
        .rename(columns={'trainee': 'trainee_list'})
    )

    return user_trainee_hob_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _label_df_to_tuples_dict(user_item_label_df):
    """
    Turn a labelled recall DataFrame into ``{user_id: make_tuple_func(group)}``.

    Returns None when ``user_item_label_df`` is None (no validation split).
    """
    if user_item_label_df is None:
        return None

    label_tuples = user_item_label_df.groupby('user_id'). \
        progress_apply(make_tuple_func).reset_index()

    # After reset_index() the per-user result of make_tuple_func sits in column 0.
    return dict(zip(label_tuples['user_id'], label_tuples[0]))


def _load_or_create_behavior_feats(file_name, log_message, user_item_label_tuples_dict,
                                   select_hist, shixun_info_df, shixun_info_dict, item_emb_dict):
    """
    Return the behaviour (user-item cross) features for one data split.

    If ``file_name`` already exists under ``shixun_features_save_path`` it is
    read back; otherwise the features are built with ``create_behavior_feature``
    and written to that file. Returns None when there is neither a cached file
    nor a label dict to build from.
    """
    cache_path = shixun_features_save_path + file_name

    if os.path.exists(cache_path):
        # NOTE(review): a cached file is used even when the label dict is None,
        # as in the original code - confirm stale caches are cleaned between runs.
        feats_df = pd.read_csv(cache_path, sep='\t', encoding='utf-8')
        reduce_mem(feats_df)  # return value ignored, as in the original
        return feats_df

    if user_item_label_tuples_dict is None:
        return None

    logger.info(log_message)
    feats_df = create_behavior_feature(user_item_label_tuples_dict.keys(),
                                       user_item_label_tuples_dict,
                                       select_hist,
                                       shixun_info_df,
                                       shixun_info_dict,
                                       item_emb_dict)

    if feats_df is not None:
        feats_df.to_csv(cache_path, sep='\t', index=False, header=True)
        reduce_mem(feats_df)

    return feats_df


def _user_mean_feature(all_data, column, feature_name):
    """Return a (user_id, feature_name) frame holding the per-user mean of ``column``."""
    return (all_data.groupby('user_id')[column].agg('mean').reset_index()
            .rename(columns={column: feature_name}))


def _left_merge(feats_df, other_df, key):
    """Left-merge ``other_df`` onto ``feats_df`` on ``key``; None passes through."""
    if feats_df is None:
        return None
    return feats_df.merge(other_df, on=key, how='left')


def _add_is_trainee_hab(feats_df):
    """
    Add the 'is_trainee_hab' column and drop the helper column 'trainee_list'.

    Modifies ``feats_df`` in place and returns it; None passes through.
    """
    if feats_df is None:
        return None
    feats_df['is_trainee_hab'] = feats_df.progress_apply(fill_is_trainee_hab, axis=1)
    # 'trainee_list' is only needed to derive the flag above; the ranking model
    # does not use it.
    del feats_df['trainee_list']
    return feats_df


def build_rank_features_engineering():
    """
    Feature engineering for the ranking model.

    Pipeline, in order:
      1. load the train / validation / test selection data and the recall lists;
      2. label the recalled (user, item) pairs (with negative sampling, per the
         log message) and build or load the cached behaviour features per split;
      3. compute user-level and item-level statistics from all behaviour data;
      4. merge everything onto each split and write the results to disk.

    Files written: the per-split behaviour-feature caches and
    'user_features_df.csv' under ``shixun_features_save_path``, plus the final
    frames at ``shixun_train_user_item_feats``, ``shixun_val_user_item_feats``
    (only when a validation split exists), ``shixun_test_user_item_feats`` and
    ``shixun_all_user_item_feats``.

    Returns:
        None.
    """
    logger.info('获取训练验证测试数据集')

    # The only difference between offline and online mode is whether a
    # validation set exists: online, select_val and val_ans are None.
    select_train, select_val, select_test, val_ans = get_train_val_test_data(offline=offline_mode)

    logger.info('获取用户历史选择和最后一次选择')
    select_train_hist, select_train_last = get_hist_and_last_select(select_train)

    if select_val is not None:
        select_val_hist, select_val_last = select_val, val_ans
    else:
        select_val_hist, select_val_last = None, None

    select_test_hist = select_test

    # Read the offline recall results: with the full data only the pinsage
    # recall is used; with incremental data the merged multi-channel recall is.
    logger.info('获取召回列表数据')
    recall_list_dict = get_recall_list(single_recall_model='pinsage', multi_recall=samples_mode)

    logger.info('召回数据转换成DataFrame...')
    recall_list_df = recall_dict_2_df(recall_list_dict)

    logger.info('给训练验证测试数据集打标签,负采样...')
    train_user_item_label_df, val_user_item_label_df, test_user_item_label_df = \
        get_user_recall_item_label_df(select_train_hist,
                                      select_val_hist,
                                      select_test_hist,
                                      select_train_last,
                                      select_val_last,
                                      recall_list_df)

    logger.info('召回数据转换成字典')
    train_user_item_label_tuples_dict = _label_df_to_tuples_dict(train_user_item_label_df)
    val_user_item_label_tuples_dict = _label_df_to_tuples_dict(val_user_item_label_df)
    test_user_item_label_tuples_dict = _label_df_to_tuples_dict(test_user_item_label_df)

    logger.info("获取用户信息")
    users_info = get_user_info_df()

    # User profile columns that are used as features.
    users_info = users_info[['user_id', 'gender', 'school_id',
                             'identity', 'edu_background', 'logins', 'grade', 'experience']]

    logger.info('获取物品信息')
    shixun_info_df = get_item_info_df()

    # Item columns that are used as features.
    shixun_info_df = shixun_info_df[['shixun_id', 'visits', 'trainee',
                                     'myshixuns_count', 'challenges_count', 'averge_star',
                                     'task_pass', 'created_at_ts']]

    logger.info('生成物品信息字典')
    shixun_info_dict = get_rank_item_info_dict(shixun_info_df)

    logger.info('获取物品向量化特征')
    # Only the word2vec item embedding is used below; the other embeddings are
    # unpacked solely to match the shape of get_embedding()'s return value.
    item_word2vec_emb_dict, item_youtube_emb_dict, \
        user_youtube_emb_dict, item_dssm_emb_dict, user_dssm_emb_dict = get_embedding()

    train_user_item_feats_df = _load_or_create_behavior_feats(
        'train_user_item_behavior_feats_df.csv', '生成训练数据集中物品交叉特征',
        train_user_item_label_tuples_dict, select_train_hist,
        shixun_info_df, shixun_info_dict, item_word2vec_emb_dict)

    val_user_item_feats_df = _load_or_create_behavior_feats(
        'val_user_item_behavior_feats_df.csv', '生成验证数据集中物品交叉特征',
        val_user_item_label_tuples_dict, select_val_hist,
        shixun_info_df, shixun_info_dict, item_word2vec_emb_dict)

    test_user_item_feats_df = _load_or_create_behavior_feats(
        'test_user_item_behavior_feats_df.csv', '生成测试数据集中物品交叉特征',
        test_user_item_label_tuples_dict, select_test_hist,
        shixun_info_df, shixun_info_dict, item_word2vec_emb_dict)

    # Item behaviour data: all of the data loaded above. The previous code
    # overwrote the train+val frame with train+test, silently dropping the
    # validation rows; DataFrame.append is also gone in pandas 2.0.
    behavior_frames = [select_train]
    if select_val is not None:
        behavior_frames.append(select_val)
    behavior_frames.append(select_test)
    all_data = pd.concat(behavior_frames)

    # Attach the item information.
    all_data = all_data.merge(shixun_info_df, on='shixun_id', how='left')

    logger.info('生成用户活跃度特征')
    user_act_fea = active_level(all_data, ['user_id', 'shixun_id', 'created_timestamp'])

    logger.info('生成物品热度特征')
    shixun_hot_fea = hot_level(all_data, ['user_id', 'shixun_id', 'created_timestamp'])

    # User time-habit features.
    user_time_hob_cols = ['user_id', 'created_timestamp', 'created_at_ts']
    user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)

    # Difficulty ('trainee') values of the items each user selected.
    user_category_hob_cols = ['user_id', 'trainee']
    user_trainee_hob_info = user_trainee_hob_fea(all_data, user_category_hob_cols)

    # Per-user means over the selected items: visit count, selection count,
    # challenge count, average star rating and pass count.
    user_visits_count_info = _user_mean_feature(all_data, 'visits', 'visits_hbo')
    user_myshixuns_count_info = _user_mean_feature(all_data, 'myshixuns_count', 'myshixuns_hbo')
    user_challenges_count_info = _user_mean_feature(all_data, 'challenges_count', 'challenges_hbo')
    user_averge_star_info = _user_mean_feature(all_data, 'averge_star', 'averge_star_hbo')
    user_task_pass_info = _user_mean_feature(all_data, 'task_pass', 'task_pass_hbo')

    # Number of selection records per user.
    user_shixun_num_info = all_data.groupby('user_id')['shixun_id'].count(). \
        reset_index(name='seq_length')

    logger.info('合并用户特征')
    # Default (inner) joins on user_id, applied in this order so the resulting
    # column order is unchanged.
    user_features = user_act_fea
    for user_fea_df in (user_time_hob_info,
                        user_trainee_hob_info,
                        user_visits_count_info,
                        user_myshixuns_count_info,
                        user_challenges_count_info,
                        user_averge_star_info,
                        user_task_pass_info,
                        user_shixun_num_info):
        user_features = user_features.merge(user_fea_df, on='user_id')

    # Attach the user profile columns.
    user_features = user_features.merge(users_info, on='user_id', how='left')

    logger.info('保存用户特征')
    user_features.to_csv(shixun_features_save_path + 'user_features_df.csv',
                         sep='\t', header=True, index=False)

    logger.info('拼接用户特征')
    train_user_item_feats_df = _left_merge(train_user_item_feats_df, user_features, 'user_id')
    val_user_item_feats_df = _left_merge(val_user_item_feats_df, user_features, 'user_id')
    test_user_item_feats_df = _left_merge(test_user_item_feats_df, user_features, 'user_id')

    logger.info('拼接物品特征')
    train_user_item_feats_df = _left_merge(train_user_item_feats_df, shixun_info_df, 'shixun_id')
    train_user_item_feats_df = _left_merge(train_user_item_feats_df, shixun_hot_fea, 'shixun_id')

    val_user_item_feats_df = _left_merge(val_user_item_feats_df, shixun_info_df, 'shixun_id')
    val_user_item_feats_df = _left_merge(val_user_item_feats_df, shixun_hot_fea, 'shixun_id')

    test_user_item_feats_df = _left_merge(test_user_item_feats_df, shixun_info_df, 'shixun_id')
    test_user_item_feats_df = _left_merge(test_user_item_feats_df, shixun_hot_fea, 'shixun_id')

    # Flag whether the item's difficulty is among those the user selected, then
    # drop the helper column the ranking model does not use.
    train_user_item_feats_df = _add_is_trainee_hab(train_user_item_feats_df)
    val_user_item_feats_df = _add_is_trainee_hab(val_user_item_feats_df)
    test_user_item_feats_df = _add_is_trainee_hab(test_user_item_feats_df)

    logger.info('保存所有特征')
    train_user_item_feats_df.to_csv(shixun_train_user_item_feats, sep='\t', index=False, header=True)

    if val_user_item_feats_df is not None:
        val_user_item_feats_df.to_csv(shixun_val_user_item_feats, sep='\t', index=False, header=True)

    test_user_item_feats_df.to_csv(shixun_test_user_item_feats, sep='\t', index=False, header=True)

    # Same row order as the original chained appends: train, test, then val.
    all_feats_frames = [train_user_item_feats_df, test_user_item_feats_df]
    if val_user_item_feats_df is not None:
        all_feats_frames.append(val_user_item_feats_df)
    all_user_item_feats_df = pd.concat(all_feats_frames)
    all_user_item_feats_df.to_csv(shixun_all_user_item_feats, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the ranking feature-engineering pipeline when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    build_rank_features_engineering()
|