You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

897 lines
39 KiB

5 months ago
import os
import sys
sys.path.append(os.getcwd())
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import warnings
from utils import reduce_mem
from config import logger
from config import samples_mode
from config import shixun_features_save_path
from config import myshixuns_train_data, myshixuns_test_data
from config import shixun_itemcf_recall_dict
from config import shixun_item_embedding_recall_dict
from config import shixun_youtubednn_recall_dict
from config import shixun_youtubednn_usercf_recall_dict
from config import shixun_dssm_recall_dict
from config import shixun_pinsage_recall_dict
from config import shixun_final_recall_items_dict
# from config import shixuns_bert_emb_dict
from config import shixun_item_w2v_emb_dict
from config import shixun_train_user_item_feats
from config import shixun_val_user_item_feats
from config import shixun_test_user_item_feats
from config import shixun_youtube_user_emb_dict
from config import shixun_youtube_item_emb_dict
from config import shixun_all_user_item_feats
from config import shixun_dssm_item_emb_dict
from config import shixun_dssm_user_emb_dict
from config import offline_mode
from matching.shixun.recall_comm import get_item_info_df
from matching.shixun.recall_comm import get_hist_and_last_select
from matching.shixun.recall_comm import get_user_info_df
from ranking.shixun.rank_comm import fill_is_trainee_hab
from ranking.shixun.rank_comm import get_rank_item_info_dict
# from ranking.shixun.rank_comm import get_item_bert_emb_dict
# Register tqdm's `progress_apply` on pandas objects; the functions below call it
# on Series, DataFrames and groupby objects.
tqdm.pandas()
# Suppress all warnings for the whole process (no category filter is given).
warnings.filterwarnings('ignore')
def train_val_split(all_select_df, sample_rate=0.3, random_state=None):
    """
    Split the interaction log into a training part and a validation part by user.

    A random subset of users is moved to the validation side. For each of those
    users the chronologically last interaction becomes the "answer" and the
    earlier interactions stay as validation history.

    :param all_select_df: interaction log; must contain the columns
        ``user_id`` and ``created_timestamp``
    :param sample_rate: fraction of distinct users sampled into validation
    :param random_state: optional seed for a reproducible split; when None the
        global NumPy random state is used (the previous behaviour)
    :return: ``(select_train, select_val, val_ans)`` where ``select_val`` is the
        validation history and ``val_ans`` holds one row per validation user
    """
    all_user_ids = all_select_df.user_id.unique()
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # replace=False: a user can be drawn at most once.
    sample_user_ids = rng.choice(all_user_ids,
                                 size=int(len(all_user_ids) * sample_rate),
                                 replace=False)

    in_val = all_select_df['user_id'].isin(sample_user_ids)
    select_train = all_select_df[~in_val]
    select_val = all_select_df[in_val].sort_values(['user_id', 'created_timestamp'])

    # After sorting, the last occurrence of each user_id is that user's latest
    # interaction. A boolean mask avoids a Python-level groupby.apply and does
    # not depend on the index labels being unique.
    is_last = ~select_val.duplicated('user_id', keep='last')
    val_ans = select_val[is_last]
    select_val = select_val[~is_last].reset_index(drop=True)

    # Users with a single interaction have an answer but no history left;
    # keep only users that appear on both sides.
    val_ans = val_ans[val_ans.user_id.isin(select_val.user_id.unique())]
    select_val = select_val[select_val.user_id.isin(val_ans.user_id.unique())]
    return select_train, select_val, val_ans
def get_train_val_test_data(offline=True):
    """
    Load the train / validation / test interaction logs.

    :param offline: when True, a validation split is carved out of the training
        file via ``train_val_split``; when False no validation data is produced
    :return: ``(select_train, select_val, select_test, val_ans)``; the two
        validation entries are None when ``offline`` is False
    """
    csv_options = dict(sep='\t', encoding='utf-8')

    train_frame = pd.read_csv(myshixuns_train_data, **csv_options)
    if offline:
        select_train, select_val, val_ans = train_val_split(train_frame, sample_rate=0.3)
    else:
        select_train, select_val, val_ans = train_frame, None, None

    select_test = pd.read_csv(myshixuns_test_data, **csv_options)
    return select_train, select_val, select_test, val_ans
def get_recall_list(single_recall_model=None, multi_recall=False):
    """
    Load a pickled recall result, either the merged multi-channel one or the
    result of a single recall model.

    :param single_recall_model: one of ``'i2i_itemcf'``, ``'i2i_emb_itemcf'``,
        ``'user_cf'``, ``'youtubednn'``, ``'dssm'``, ``'pinsage'``; ignored when
        ``multi_recall`` is truthy
    :param multi_recall: when truthy, load the merged multi-channel recall file
    :return: the unpickled recall object
    :raises ValueError: if ``multi_recall`` is falsy and ``single_recall_model``
        is not a known model name (previously this silently returned None)
    """
    if multi_recall:
        recall_path = shixun_final_recall_items_dict
    else:
        single_model_paths = {
            'i2i_itemcf': shixun_itemcf_recall_dict,
            'i2i_emb_itemcf': shixun_item_embedding_recall_dict,
            'user_cf': shixun_youtubednn_usercf_recall_dict,
            'youtubednn': shixun_youtubednn_recall_dict,
            'dssm': shixun_dssm_recall_dict,
            'pinsage': shixun_pinsage_recall_dict,
        }
        if single_recall_model not in single_model_paths:
            raise ValueError(
                'Unknown single_recall_model {!r}; expected one of {}'.format(
                    single_recall_model, sorted(single_model_paths)))
        recall_path = single_model_paths[single_recall_model]

    # NOTE(review): pickle must only be used on trusted files — confirm these
    # paths only ever point at artifacts written by this project.
    with open(recall_path, 'rb') as recall_file:
        return pickle.load(recall_file)
def _load_embedding_dict(path):
    """Unpickle one embedding file; return an empty dict (and warn) if it is missing."""
    if not os.path.exists(path):
        logger.warning(os.path.basename(path) + ' file not exist.')
        return {}
    # NOTE(review): pickle must only be used on trusted files — confirm these
    # paths only ever point at artifacts written by this project.
    with open(path, 'rb') as emb_file:
        return pickle.load(emb_file)


def get_embedding():
    """
    Load the pre-trained embedding lookups.

    Each file is optional: a missing file used to leave its variable unassigned,
    so the final ``return`` raised ``UnboundLocalError``. A missing file now
    yields an empty dict for that entry and a warning is logged.

    :return: ``(item_word2vec_emb_dict, item_youtube_emb_dict,
        user_youtube_emb_dict, item_dssm_emb_dict, user_dssm_emb_dict)``
    """
    # The word2vec embedding has to be trained beforehand.
    item_word2vec_emb_dict = _load_embedding_dict(shixun_item_w2v_emb_dict)
    item_youtube_emb_dict = _load_embedding_dict(shixun_youtube_item_emb_dict)
    user_youtube_emb_dict = _load_embedding_dict(shixun_youtube_user_emb_dict)
    item_dssm_emb_dict = _load_embedding_dict(shixun_dssm_item_emb_dict)
    user_dssm_emb_dict = _load_embedding_dict(shixun_dssm_user_emb_dict)
    return (item_word2vec_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict,
            item_dssm_emb_dict, user_dssm_emb_dict)
def recall_dict_2_df(recall_list_dict):
    """
    Flatten a recall dictionary into a DataFrame.

    :param recall_list_dict: mapping ``user -> iterable of (item, score)`` pairs
    :return: DataFrame with the columns ``user_id``, ``sim_item``, ``score``,
        one row per (user, item) pair
    """
    flat_rows = [
        [user, item, score]
        for user, candidates in tqdm(recall_list_dict.items())
        for item, score in candidates
    ]
    return pd.DataFrame(flat_rows, columns=['user_id', 'sim_item', 'score'])
def neg_sample_recall_data(recall_items_df, sample_rate=0.05, max_sample_num=20):
    """
    Down-sample the negative rows of a labelled recall frame.

    Negatives are sampled twice, once per user and once per item, so that every
    user and every item that had a negative still has at least one afterwards.
    The two samples are merged, de-duplicated and concatenated with all
    positive rows.

    :param recall_items_df: frame with the columns ``user_id``, ``sim_item``,
        ``score`` and ``label`` (1 = positive, 0 = negative)
    :param sample_rate: fraction of each group's negatives to draw
    :param max_sample_num: upper bound on the rows drawn per group
    :return: frame holding all positives plus the sampled negatives
    """
    def report_balance(frame, ndigits):
        # Print the class counts; guard the ratio against an empty frame.
        pos_num = int((frame['label'] == 1).sum())
        neg_num = int((frame['label'] == 0).sum())
        total = pos_num + neg_num
        ratio = round(pos_num / total, ndigits) if total else 0.0
        print('正样本数量:', pos_num, '负样本数量:', neg_num, '正样本比率:', ratio)

    def neg_sample_func(group_df):
        # At least one row per group, at most max_sample_num.
        sample_num = min(max(int(len(group_df) * sample_rate), 1), max_sample_num)
        # Drawn with replacement (as before); repeats are removed by the
        # de-duplication step below.
        return group_df.sample(n=sample_num, replace=True)

    logger.info('采样之前数据')
    pos_data = recall_items_df[recall_items_df['label'] == 1]
    neg_data = recall_items_df[recall_items_df['label'] == 0]
    report_balance(recall_items_df, 6)

    if neg_data.empty:
        # Nothing to sample; groupby-apply on an empty frame would lose columns.
        neg_data_new = neg_data
    else:
        # Per-user sampling keeps every user in the result.
        neg_data_user_sample = neg_data.groupby('user_id', group_keys=False). \
            progress_apply(neg_sample_func)
        # Per-item sampling keeps every item in the result.
        neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False). \
            progress_apply(neg_sample_func)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        neg_data_new = pd.concat([neg_data_user_sample, neg_data_item_sample])
        # The two passes are independent, so the same row can be picked twice.
        neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(
            ['user_id', 'sim_item'], keep='last')

    data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)

    logger.info('采样之后数据')
    report_balance(data_new, 4)
    return data_new
def sample_test_recall_data(recall_items_df, sample_rate=0.05, max_sample_num=20):
    """
    Randomly thin out the test-set recall candidates.

    Rows are sampled once per user and once per item so that every user and
    every item is still present afterwards; the two samples are merged and
    de-duplicated on ``(user_id, sim_item)``.

    :param recall_items_df: frame with the columns ``user_id``, ``sim_item``
        and ``score``
    :param sample_rate: fraction of each group's rows to draw
    :param max_sample_num: upper bound on the rows drawn per group
    :return: the sampled frame
    """
    logger.info('采样之前样本数量:' + str(len(recall_items_df)))

    def group_sample_func(group_df):
        # At least one row per group, at most max_sample_num.
        sample_num = min(max(int(len(group_df) * sample_rate), 1), max_sample_num)
        # Drawn with replacement (as before); repeats are removed below.
        return group_df.sample(n=sample_num, replace=True)

    if recall_items_df.empty:
        # Nothing to sample; groupby-apply on an empty frame would lose columns.
        data_new = recall_items_df
    else:
        data_user_sample = recall_items_df.groupby('user_id', group_keys=False). \
            progress_apply(group_sample_func)
        data_item_sample = recall_items_df.groupby('sim_item', group_keys=False). \
            progress_apply(group_sample_func)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        data_new = pd.concat([data_user_sample, data_item_sample])
        # The two passes are independent, so the same row can be picked twice.
        data_new = data_new.sort_values(['user_id', 'score']).drop_duplicates(
            ['user_id', 'sim_item'], keep='last')

    logger.info('采样之后样本数量:' + str(len(data_new)))
    return data_new
def get_rank_label_df(recall_list_df, label_df, is_test=False):
    """
    Attach a ``label`` column to the recall candidates.

    A candidate is labelled 1 when its ``(user_id, sim_item)`` pair appears in
    ``label_df`` (whose item column is called ``shixun_id``), otherwise 0.
    The input frame is never modified; a new frame is returned.

    :param recall_list_df: candidates with the columns ``user_id`` and ``sim_item``
    :param label_df: ground-truth interactions with the columns ``user_id`` and
        ``shixun_id``; unused (may be None) when ``is_test`` is True
    :param is_test: the test set has no labels, so every row gets the
        placeholder label -1 to keep downstream code uniform
    :return: copy of ``recall_list_df`` with an extra integer ``label`` column
    """
    if is_test:
        return recall_list_df.assign(label=-1)

    truth_keys = label_df.rename(columns={'shixun_id': 'sim_item'})[['user_id', 'sim_item']]
    # Duplicate ground-truth pairs would duplicate candidate rows in the merge.
    truth_keys = truth_keys.drop_duplicates()

    labeled = recall_list_df.merge(truth_keys, how='left',
                                   on=['user_id', 'sim_item'], indicator=True)
    # '_merge' is 'both' exactly for candidates found in the ground truth.
    labeled['label'] = (labeled['_merge'] == 'both').astype(int)
    return labeled.drop(columns='_merge')
def get_user_recall_item_label_df(select_train_hist,
                                  select_val_hist,
                                  select_test_hist,
                                  select_train_last,
                                  select_val_last,
                                  recall_list_df):
    """
    Label and sample the recall candidates of the train / validation / test users.

    Train and validation candidates are labelled against the users' last
    interaction and then negative-sampled; test candidates get the placeholder
    label and are randomly sampled to shorten feature generation.

    :return: ``(train_df, val_df, test_df)``; ``val_df`` is None when
        ``select_val_hist`` is None
    """
    def candidates_of(hist_df):
        # Recall rows that belong to the users present in hist_df.
        users = hist_df['user_id'].unique()
        return recall_list_df[recall_list_df['user_id'].isin(users)]

    # --- training set ---
    train_candidates = candidates_of(select_train_hist)
    logger.info('训练集数据打标签')
    train_labeled = get_rank_label_df(train_candidates, select_train_last, is_test=False)
    logger.info('训练集数据负采样')
    train_labeled = neg_sample_recall_data(train_labeled)

    # --- validation set (optional) ---
    val_labeled = None
    if select_val_hist is not None:
        val_candidates = candidates_of(select_val_hist)
        logger.info('验证集数据打标签')
        val_labeled = get_rank_label_df(val_candidates, select_val_last, is_test=False)
        logger.info('验证集数据负采样')
        val_labeled = neg_sample_recall_data(val_labeled)

    # --- test set ---
    test_candidates = candidates_of(select_test_hist)
    logger.info('测试集数据打标签')
    test_labeled = get_rank_label_df(test_candidates, None, is_test=True)
    logger.info('测试集数据随机采样')
    test_labeled = sample_test_recall_data(test_labeled)

    return train_labeled, val_labeled, test_labeled
def make_tuple_func(group_df):
    """
    Convert one group of the labelled recall frame into a list of
    ``(sim_item, score, label)`` tuples, one per row and in row order.
    """
    # iterrows is kept on purpose: it yields each row as a Series, so the
    # values carry the row's common dtype exactly as before.
    return [
        (row['sim_item'], row['score'], row['label'])
        for _, row in group_df.iterrows()
    ]
def get_cos_similar_matrix(v1, v2):
    """
    Return the cosine similarity of two vectors rescaled from [-1, 1] to [0, 1].

    :param v1: first vector (array-like)
    :param v2: second vector, same length as ``v1``
    :return: ``0.5 + 0.5 * cos(v1, v2)`` as a Python float; 0.5 (cosine 0) when
        the cosine is undefined, e.g. for a zero-length vector
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    # Product of the two Euclidean norms.
    denom = float(np.linalg.norm(v1)) * float(np.linalg.norm(v2))
    if denom == 0.0:
        # A zero vector has no direction; the old code divided 0 by 0 here and
        # returned NaN because only -inf was handled.
        return 0.5
    # .item() yields a plain scalar instead of calling float() on a 1-element array.
    cos = float(np.asarray(np.dot(v1, v2)).item()) / denom
    if not np.isfinite(cos):
        return 0.5
    return 0.5 + 0.5 * cos
def create_behavior_feature(users_id,
                            recall_list,
                            select_hist_df,
                            shixuns_info,
                            shixun_info_dict,
                            shixuns_emb,
                            user_emb=None,
                            N=1):
    """
    Build user-history x candidate-item cross features.

    For every candidate of every user, the candidate is compared with the
    user's last ``N`` history items: embedding similarity plus the absolute
    difference of seven item attributes. Every output row has the same width.
    When a user has fewer than ``N`` usable history items the missing slots are
    NaN; previously such rows were shorter than the column list, which
    misaligned score/rank/label or made the DataFrame construction fail.

    :param users_id: iterable of user ids to process
    :param recall_list: mapping ``user_id -> list of (shixun_id, score, label)``
    :param select_hist_df: history interactions with the columns ``user_id`` and
        ``shixun_id``; the last rows per user (in frame order) are used
    :param shixuns_info: item frame with a ``shixun_id`` column; candidates and
        history items not listed there are skipped
    :param shixun_info_dict: mapping ``shixun_id -> sequence`` whose first
        element holds the seven numeric attributes compared below
    :param shixuns_emb: mapping ``shixun_id -> embedding vector``
    :param user_emb: optional mapping ``user_id -> embedding vector``; must have
        the same dimension as ``shixuns_emb`` because a dot product is taken
    :param N: number of most recent history items to compare against (>= 1)
    :return: DataFrame with ``user_id``, ``shixun_id``, the per-history feature
        columns, ``sim_max/min/sum/mean``, optionally ``user_item_sim``, and
        ``score``, ``rank``, ``label``
    :raises ValueError: if ``N`` is smaller than 1
    """
    if N < 1:
        raise ValueError('N must be a positive integer, got {!r}'.format(N))

    # NOTE: these casts write back into the caller's frames, as they always did.
    shixuns_info['shixun_id'] = shixuns_info['shixun_id'].astype(int)
    select_hist_df['user_id'] = select_hist_df['user_id'].astype(int)

    # A set gives O(1) membership tests inside the nested loops.
    known_items = set(shixuns_info['shixun_id'].values.tolist())
    # Group once instead of filtering the whole history frame for every user.
    recent_items_by_user = {
        uid: items.values[-N:].tolist()
        for uid, items in select_hist_df.groupby('user_id', sort=False)['shixun_id']
    }
    use_user_emb = bool(user_emb)

    # Positions inside shixun_info_dict[item][0], listed in output-column order:
    # time(0), visits(2), myshixuns(3), challenges(4), trainee(1),
    # averge_star(5), task_pass(6).
    # NOTE(review): the position -> attribute mapping is taken from the original
    # variable names; confirm against get_rank_item_info_dict.
    attr_slots = (0, 2, 3, 4, 1, 5, 6)

    all_user_feas = []
    for user_id in tqdm(users_id):
        hist_user_items = recent_items_by_user.get(user_id, [])
        # rank is the position in the original recall list, skipped items included.
        for rank, (shixun_id, score, label) in enumerate(recall_list[user_id]):
            if shixun_id not in known_items:
                continue
            shixun_id = int(shixun_id)
            cand_attrs = shixun_info_dict[shixun_id][0]

            sim_fea = []
            diff_feas = [[] for _ in attr_slots]
            for hist_item in hist_user_items:
                if hist_item not in known_items:
                    continue
                hist_item = int(hist_item)
                hist_attrs = shixun_info_dict[hist_item][0]
                if (hist_item in shixuns_emb) and (shixun_id in shixuns_emb):
                    sim_fea.append(get_cos_similar_matrix(shixuns_emb[hist_item],
                                                          shixuns_emb[shixun_id]))
                else:
                    sim_fea.append(0.0)
                for diffs, slot in zip(diff_feas, attr_slots):
                    diffs.append(abs(cand_attrs[slot] - hist_attrs[slot]))

            # Pad every per-history feature group to exactly N slots.
            padding = [np.nan] * (N - len(sim_fea))
            single_user_fea = [user_id, shixun_id]
            single_user_fea.extend(sim_fea + padding)
            for diffs in diff_feas:
                single_user_fea.extend(diffs + padding)

            # Summary statistics of the similarities.
            if sim_fea:
                single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea),
                                        sum(sim_fea) / len(sim_fea)])
            else:
                single_user_fea.extend([np.nan] * 4)

            # Optional user <-> candidate similarity.
            if use_user_emb:
                if (user_id not in user_emb) or (shixun_id not in shixuns_emb):
                    single_user_fea.append(0.0)
                else:
                    single_user_fea.append(np.dot(user_emb[user_id], shixuns_emb[shixun_id]))

            single_user_fea.extend([score, rank, label])
            all_user_feas.append(single_user_fea)

    # Column names, in the same order in which the rows were filled.
    id_cols = ['user_id', 'shixun_id']
    per_hist_prefixes = ['sim', 'time_diff', 'visit_diff', 'myshixuns_diff', 'challenges_diff',
                         'trainee_diff', 'averge_star_diff', 'task_pass_diff']
    per_hist_cols = [prefix + str(i) for prefix in per_hist_prefixes for i in range(N)]
    sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']
    user_item_sim_cols = ['user_item_sim'] if use_user_emb else []
    user_score_rank_label = ['score', 'rank', 'label']
    cols = id_cols + per_hist_cols + sat_cols + user_item_sim_cols + user_score_rank_label

    return pd.DataFrame(all_user_feas, columns=cols)
def active_level(all_data, cols):
    """
    Build the user activity feature.

    For each user this computes the number of selections and the mean gap
    between consecutive selection timestamps. The reciprocal of the count and
    the mean gap are min-max normalised and summed into ``active_level``;
    a smaller value means a more active user.

    The result is cached: if ``user_active_level.csv`` already exists under
    ``shixun_features_save_path`` it is returned as-is and ``all_data`` is ignored.

    :param all_data: interaction log
    :param cols: columns to use; must include ``user_id``, ``shixun_id`` and
        ``created_timestamp``
    :return: DataFrame with ``user_id``, ``select_size``,
        ``user_time_diff_mean`` and ``active_level``
    """
    cache_path = shixun_features_save_path + 'user_active_level.csv'
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, sep='\t', encoding='utf-8')

    def time_diff_mean(timestamps):
        # NOTE(review): a single selection yields the constant 1. The original
        # description calls this "a large number", which only holds if the
        # timestamps are already scaled to [0, 1] — confirm upstream.
        if len(timestamps) == 1:
            return 1
        return np.mean(np.diff(timestamps))

    def min_max(series):
        # Min-max scaling that returns zeros instead of 0/0 = NaN when all
        # values are equal.
        low, high = series.min(), series.max()
        if high - low == 0:
            return pd.Series(0.0, index=series.index)
        return (series - low) / (high - low)

    # sort_values returns a new frame, so the caller's data is left untouched.
    data = all_data[cols].sort_values(['user_id', 'created_timestamp'])
    per_user = data.groupby('user_id')

    user_act = per_user['shixun_id'].size().rename('select_size').to_frame()
    user_act['user_time_diff_mean'] = per_user['created_timestamp'].progress_apply(
        lambda ts: time_diff_mean(ts.values))
    user_act = user_act.reset_index()

    # Reciprocal of the selection count, then normalise both components.
    user_act['select_size'] = min_max(1 / user_act['select_size'])
    user_act['user_time_diff_mean'] = min_max(user_act['user_time_diff_mean'])
    user_act['active_level'] = user_act['select_size'] + user_act['user_time_diff_mean']
    user_act['user_id'] = user_act['user_id'].astype('int')

    user_act.to_csv(cache_path, index=False, header=True, sep='\t')
    return user_act
def hot_level(all_data, cols):
    """
    Build the item popularity feature.

    For each item this computes how many interactions it received and the mean
    gap between consecutive selection timestamps. The reciprocal of the count
    and the mean gap are min-max normalised and summed into ``hot_level``;
    a smaller value means a hotter item.

    The result is cached: if ``shixun_hot_level.csv`` already exists under
    ``shixun_features_save_path`` it is returned as-is and ``all_data`` is ignored.

    :param all_data: interaction log
    :param cols: columns to use; must include ``user_id``, ``shixun_id`` and
        ``created_timestamp``
    :return: DataFrame with ``shixun_id``, ``user_num``,
        ``item_time_diff_mean`` and ``hot_level``
    """
    cache_path = shixun_features_save_path + 'shixun_hot_level.csv'
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, sep='\t', encoding='utf-8')

    def time_diff_mean(timestamps):
        # An item selected only once has no gap; the constant 1 is used instead.
        if len(timestamps) == 1:
            return 1
        return np.mean(np.diff(timestamps))

    def min_max(series):
        # Min-max scaling that returns zeros instead of 0/0 = NaN when all
        # values are equal.
        low, high = series.min(), series.max()
        if high - low == 0:
            return pd.Series(0.0, index=series.index)
        return (series - low) / (high - low)

    # sort_values returns a new frame, so the caller's data is left untouched.
    data = all_data[cols].sort_values(['shixun_id', 'created_timestamp'])
    per_item = data.groupby('shixun_id')

    shixun_hot = per_item['user_id'].size().rename('user_num').to_frame()
    shixun_hot['item_time_diff_mean'] = per_item['created_timestamp'].progress_apply(
        lambda ts: time_diff_mean(ts.values))
    shixun_hot = shixun_hot.reset_index()

    # Reciprocal of the interaction count, then normalise both components.
    shixun_hot['user_num'] = min_max(1 / shixun_hot['user_num'])
    shixun_hot['item_time_diff_mean'] = min_max(shixun_hot['item_time_diff_mean'])
    shixun_hot['hot_level'] = shixun_hot['user_num'] + shixun_hot['item_time_diff_mean']
    shixun_hot['shixun_id'] = shixun_hot['shixun_id'].astype('int')

    shixun_hot.to_csv(cache_path, index=False, header=True, sep='\t')
    return shixun_hot
def user_time_hob_fea(all_data, cols):
    """
    Build the user time-habit features.

    The two timestamp columns are min-max scaled over the whole data set and
    then averaged per user.

    :param all_data: interaction log joined with item information
    :param cols: columns to use; must include ``user_id``,
        ``created_timestamp`` and ``created_at_ts``
    :return: one row per user with ``user_time_hob1`` (mean scaled
        ``created_timestamp``) and ``user_time_hob2`` (mean scaled ``created_at_ts``)
    """
    time_habit = all_data[cols]
    # Scale each timestamp column independently to [0, 1].
    for ts_col in ('created_timestamp', 'created_at_ts'):
        time_habit[ts_col] = MinMaxScaler().fit_transform(time_habit[[ts_col]])

    per_user_mean = time_habit.groupby('user_id').agg('mean').reset_index()
    renamed = per_user_mean.rename(columns={'created_timestamp': 'user_time_hob1',
                                            'created_at_ts': 'user_time_hob2'})
    return renamed
def user_trainee_hob_fea(all_data, cols):
    """
    Collect, per user, the set of difficulty levels (``trainee``) of the items
    the user selected.

    The values are stored as strings. Downstream code turns the set into a 0/1
    feature that tells whether a candidate's difficulty is among them.

    :param all_data: interaction log joined with item information
    :param cols: columns to use; must include ``user_id`` and ``trainee``
    :return: DataFrame with ``user_id`` and ``trainee_list`` (a set of strings)
    """
    # Work on a copy so the string cast never touches the caller's frame.
    user_trainee = all_data[cols].copy()
    user_trainee['trainee'] = user_trainee['trainee'].astype(str)
    # Aggregate only the column that is needed; this gives flat column names
    # instead of the MultiIndex produced by agg({set}).
    trainee_sets = user_trainee.groupby('user_id')['trainee'].apply(lambda s: set(s))
    return trainee_sets.rename('trainee_list').reset_index()
def _label_df_to_tuple_dict(label_df):
    """Turn a labelled recall frame into ``{user_id: [(item, score, label), ...]}``."""
    grouped = label_df.groupby('user_id').progress_apply(make_tuple_func).reset_index()
    return dict(zip(grouped['user_id'], grouped[0]))


def _load_or_build_behavior_feats(file_name, log_message, tuples_dict, hist_df,
                                  shixun_info_df, shixun_info_dict, item_emb_dict):
    """Load cached behaviour features, or build and cache them; None if there is nothing to build."""
    cache_path = shixun_features_save_path + file_name
    if os.path.exists(cache_path):
        feats_df = pd.read_csv(cache_path, sep='\t', encoding='utf-8')
    elif tuples_dict is None:
        return None
    else:
        logger.info(log_message)
        feats_df = create_behavior_feature(tuples_dict.keys(),
                                           tuples_dict,
                                           hist_df,
                                           shixun_info_df,
                                           shixun_info_dict,
                                           item_emb_dict)
        feats_df.to_csv(cache_path, sep='\t', index=False, header=True)
    # The return value is ignored, as before.
    # NOTE(review): presumably reduce_mem downcasts the frame in place — confirm.
    reduce_mem(feats_df)
    return feats_df


def build_rank_features_engineering():
    """
    Run the whole feature-engineering pipeline for the ranking model.

    Steps: load the interaction data, load and label the recall candidates,
    build user-history x candidate cross features, build user and item
    statistics, join everything and write the train / validation / test /
    combined feature tables to the paths configured in ``config``.

    Offline and online mode differ only in whether a validation split exists.
    """
    logger.info('获取训练验证测试数据集')
    select_train, select_val, select_test, val_ans = get_train_val_test_data(offline=offline_mode)

    logger.info('获取用户历史选择和最后一次选择')
    select_train_hist, select_train_last = get_hist_and_last_select(select_train)
    if select_val is not None:
        select_val_hist, select_val_last = select_val, val_ans
    else:
        select_val_hist, select_val_last = None, None
    select_test_hist = select_test

    # With samples_mode truthy the merged multi-channel recall is loaded,
    # otherwise only the pinsage recall result.
    logger.info('获取召回列表数据')
    recall_list_dict = get_recall_list(single_recall_model='pinsage', multi_recall=samples_mode)

    logger.info('召回数据转换成DataFrame...')
    recall_list_df = recall_dict_2_df(recall_list_dict)

    logger.info('给训练验证测试数据集打标签,负采样...')
    train_user_item_label_df, val_user_item_label_df, test_user_item_label_df = \
        get_user_recall_item_label_df(select_train_hist,
                                      select_val_hist,
                                      select_test_hist,
                                      select_train_last,
                                      select_val_last,
                                      recall_list_df)

    logger.info('召回数据转换成字典')
    train_tuples_dict = _label_df_to_tuple_dict(train_user_item_label_df)
    if val_user_item_label_df is not None:
        val_tuples_dict = _label_df_to_tuple_dict(val_user_item_label_df)
    else:
        val_tuples_dict = None
    test_tuples_dict = _label_df_to_tuple_dict(test_user_item_label_df)

    logger.info("获取用户信息")
    users_info = get_user_info_df()
    # User profile columns that are used as features.
    users_info = users_info[['user_id', 'gender', 'school_id',
                             'identity', 'edu_background', 'logins', 'grade', 'experience']]

    logger.info('获取物品信息')
    shixun_info_df = get_item_info_df()
    # Item columns that are used as features.
    shixun_info_df = shixun_info_df[['shixun_id', 'visits', 'trainee',
                                     'myshixuns_count', 'challenges_count', 'averge_star',
                                     'task_pass', 'created_at_ts']]

    logger.info('生成物品信息字典')
    shixun_info_dict = get_rank_item_info_dict(shixun_info_df)

    logger.info('获取物品向量化特征')
    item_word2vec_emb_dict, item_youtube_emb_dict, \
        user_youtube_emb_dict, item_dssm_emb_dict, user_dssm_emb_dict = get_embedding()

    # Cross features per data set (train, val, test), each cached in its own CSV.
    feats = {
        'train': _load_or_build_behavior_feats(
            'train_user_item_behavior_feats_df.csv', '生成训练数据集中物品交叉特征',
            train_tuples_dict, select_train_hist,
            shixun_info_df, shixun_info_dict, item_word2vec_emb_dict),
        'val': _load_or_build_behavior_feats(
            'val_user_item_behavior_feats_df.csv', '生成验证数据集中物品交叉特征',
            val_tuples_dict, select_val_hist,
            shixun_info_df, shixun_info_dict, item_word2vec_emb_dict),
        'test': _load_or_build_behavior_feats(
            'test_user_item_behavior_feats_df.csv', '生成测试数据集中物品交叉特征',
            test_tuples_dict, select_test_hist,
            shixun_info_df, shixun_info_dict, item_word2vec_emb_dict),
    }

    def transform_all(func):
        # Apply func to every data set that exists (val may be None).
        for key in list(feats):
            if feats[key] is not None:
                feats[key] = func(feats[key])

    # All behaviour data seen so far. The validation interactions used to be
    # discarded because the second assignment overwrote the first one.
    behavior_frames = [select_train]
    if select_val is not None:
        behavior_frames.append(select_val)
    behavior_frames.append(select_test)
    all_data = pd.concat(behavior_frames)
    # Attach item information.
    all_data = all_data.merge(shixun_info_df, on='shixun_id', how='left')

    logger.info('生成用户活跃度特征')
    user_act_fea = active_level(all_data, ['user_id', 'shixun_id', 'created_timestamp'])

    logger.info('生成物品热度特征')
    shixun_hot_fea = hot_level(all_data, ['user_id', 'shixun_id', 'created_timestamp'])

    # User time-habit features.
    user_time_hob_info = user_time_hob_fea(all_data, ['user_id', 'created_timestamp', 'created_at_ts'])

    # Difficulty levels each user has selected.
    user_trainee_hob_info = user_trainee_hob_fea(all_data, ['user_id', 'trainee'])

    # Per-user means of item attributes: visits, selection count, challenge
    # count, average stars and pass count.
    mean_feature_names = [('visits', 'visits_hbo'),
                          ('myshixuns_count', 'myshixuns_hbo'),
                          ('challenges_count', 'challenges_hbo'),
                          ('averge_star', 'averge_star_hbo'),
                          ('task_pass', 'task_pass_hbo')]
    user_mean_infos = [
        all_data.groupby('user_id')[src].agg('mean').reset_index().rename(columns={src: dst})
        for src, dst in mean_feature_names
    ]

    # Number of interactions per user.
    user_shixun_num_info = all_data.groupby('user_id')['shixun_id'].agg('count').reset_index()
    user_shixun_num_info = user_shixun_num_info.rename(columns={'shixun_id': 'seq_length'})

    logger.info('合并用户特征')
    user_features = pd.merge(user_act_fea, user_time_hob_info, on='user_id')
    user_features = user_features.merge(user_trainee_hob_info, on='user_id')
    for info in user_mean_infos:
        user_features = user_features.merge(info, on='user_id')
    user_features = user_features.merge(user_shixun_num_info, on='user_id')
    # Add the user profile columns.
    user_features = user_features.merge(users_info, on='user_id', how='left')

    logger.info('保存用户特征')
    user_features.to_csv(shixun_features_save_path + 'user_features_df.csv',
                         sep='\t', header=True, index=False)

    logger.info('拼接用户特征')
    transform_all(lambda df: df.merge(user_features, on='user_id', how='left'))

    logger.info('拼接物品特征')
    transform_all(lambda df: df.merge(shixun_info_df, on='shixun_id', how='left')
                  .merge(shixun_hot_fea, on='shixun_id', how='left'))

    def add_trainee_habit(df):
        # Flag computed per row by fill_is_trainee_hab; the helper column
        # trainee_list is not a ranking feature and is dropped afterwards.
        df['is_trainee_hab'] = df.progress_apply(lambda x: fill_is_trainee_hab(x), axis=1)
        del df['trainee_list']
        return df

    transform_all(add_trainee_habit)

    logger.info('保存所有特征')
    train_user_item_feats_df = feats['train']
    val_user_item_feats_df = feats['val']
    test_user_item_feats_df = feats['test']

    train_user_item_feats_df.to_csv(shixun_train_user_item_feats, sep='\t', index=False, header=True)
    if val_user_item_feats_df is not None:
        val_user_item_feats_df.to_csv(shixun_val_user_item_feats, sep='\t', index=False, header=True)
    test_user_item_feats_df.to_csv(shixun_test_user_item_feats, sep='\t', index=False, header=True)

    # Combined table: train, then test, then (if present) validation.
    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
    combined_frames = [train_user_item_feats_df, test_user_item_feats_df]
    if val_user_item_feats_df is not None:
        combined_frames.append(val_user_item_feats_df)
    all_user_item_feats_df = pd.concat(combined_frames)
    all_user_item_feats_df.to_csv(shixun_all_user_item_feats, sep='\t', index=False, header=True)
# Script entry point: build all ranking features when this file is run directly.
if __name__ == "__main__":
    build_rank_features_engineering()