You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

421 lines
17 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
import numpy as np
import time
import pickle
from config import myshixuns_data_path, mysubjects_data_path, RANDOM_SEED
from config import myshixuns_train_data, myshixuns_test_data
from config import mysubjects_train_data, mysubjects_test_data
from config import shixuns_data_path, subjects_data_path
from config import myshixuns_train_data_baseline, myshixuns_test_data_baseline
from config import mysubjects_train_data_baseline, mysubjects_test_data_baseline
from config import logger
from config import shixun_id_to_name_dict_data, subject_id_to_name_dict_data
from config import myshixuns_save_path, mysubjects_save_path, users_data_path
from config import data_parent_path
from tqdm import tqdm
from utils import is_number
from utils import get_before_date
from bs4 import BeautifulSoup
from config import mysql_database, mysql_passwd
from config import mysql_host, mysql_port, mysql_user
from config import root_path
from config import samples_mode
from sqlalchemy import create_engine
import pymysql
import datetime
from urllib.parse import quote
import re
tqdm.pandas()
def build_myshixuns_train_test_data_baseline():
"""
划分学生参加实践项目的baseline训练集和测试集
"""
logger.info("加载物品行为数据")
data_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
logger.info("生成itemcf baseline训练集和测试集数据")
data_df['created_timestamp'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
# 训练集划分90%测试集10%
train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
test_data = data_df[~data_df.index.isin(train_data.index)]
train_data.to_csv(myshixuns_train_data_baseline, sep='\t', index=False, header=True)
test_data.to_csv(myshixuns_test_data_baseline, sep='\t', index=False, header=True)
def build_myshixuns_train_test_data():
"""
划分学生参加实践项目的训练集和测试集
"""
logger.info("加载物品行为数据")
data_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
logger.info("生成模型训练集和测试集数据")
data_df['created_timestamp'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
# 对时间戳进行归一化, 用于在关联规则的时候计算权重
data_df['created_timestamp'] = data_df[['created_timestamp']].progress_apply(max_min_scaler)
# 训练集划分90%测试集10%
train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
test_data = data_df[~data_df.index.isin(train_data.index)]
train_data.to_csv(myshixuns_train_data, sep='\t', index=False, header=True)
test_data.to_csv(myshixuns_test_data, sep='\t', index=False, header=True)
def build_myshixuns_samples_data(sample_nums=10000):
"""
划分学生参加实践项目的采样数据集
"""
all_select = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
# user_id过滤重复
all_user_ids = all_select.user_id.unique()
# 只采集指定数量的user_id
sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False)
# 取出这些user_id选择的物品
all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
# 根据user_id, shixun_id去重
all_select = all_select.drop_duplicates((['user_id', 'shixun_id']))
all_select.to_csv(myshixuns_save_path + 'myshixuns_samples.csv', sep='\t', index=False, header=True)
return all_select
def process_shixuns_data():
"""
处理实训数据集
"""
data_df = pd.read_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', encoding='utf-8')
data_df['created_at_ts'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
data_df['shixun_id'] = data_df['shixun_id'].astype(int)
# 去除无用的实训名称
regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
data_df = data_df[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]
# 生成实训ID和实训名称的字典,方便召回时使用
shixun_id_to_name_dict = dict(zip(data_df['shixun_id'], data_df['shixun_name']))
pickle.dump(shixun_id_to_name_dict, open(shixun_id_to_name_dict_data, 'wb'))
data_df.to_csv(shixuns_data_path, sep='\t', index=False, header=True)
def process_subjects_data():
"""
处理课程数据集
"""
data_df = pd.read_csv(data_parent_path + 'subjects_raw.csv', sep='\t', encoding='utf-8')
data_df.dropna(axis=0, subset=['subject_id'], inplace=True)
data_df['created_at'].fillna('2017-01-01 00:00:00', inplace=True)
data_df['created_at_ts'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
data_df['subject_id'] = data_df['subject_id'].astype(int)
data_df['disciplines_id'] = data_df['disciplines_id'].fillna(-1)
data_df['disciplines_id'] = data_df['disciplines_id'].astype(int)
# 去除无用的课程名称
regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
data_df = data_df[~data_df['subject_name'].astype(str).str.match(regex_pattern)]
# 生成实训ID和实训名称的字典,方便召回时使用
subject_id_to_name_dict = dict(zip(data_df['subject_id'], data_df['subject_name']))
pickle.dump(subject_id_to_name_dict, open(subject_id_to_name_dict_data, 'wb'))
data_df.to_csv(subjects_data_path, sep='\t', index=False, header=True)
def build_mysubjects_train_test_data_baseline():
"""
划分学生参加实践课程的baseline训练集和测试集
"""
logger.info("生成itemcf baseline训练集和测试集数据")
data_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
data_df['created_timestamp'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
test_data = data_df[~data_df.index.isin(train_data.index)]
train_data.to_csv(mysubjects_train_data_baseline, sep='\t', index=False, header=True)
test_data.to_csv(mysubjects_test_data_baseline, sep='\t', index=False, header=True)
def build_mysubjects_train_test_data():
"""
划分学生参加实践课程的训练集和测试集
"""
logger.info("加载物品行为数据")
data_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
logger.info("生成模型训练集和测试集数据")
data_df['created_timestamp'] = data_df['created_at'].progress_apply(
lambda x:time.mktime(time.strptime(x,'%Y-%m-%d %H:%M:%S')))
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
# 对时间戳进行归一化, 用于在关联规则的时候计算权重
data_df['created_timestamp'] = data_df[['created_timestamp']].progress_apply(max_min_scaler)
# 训练集划分90%测试集10%
train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
test_data = data_df[~data_df.index.isin(train_data.index)]
train_data.to_csv(mysubjects_train_data, sep='\t', index=False, header=True)
test_data.to_csv(mysubjects_test_data, sep='\t', index=False, header=True)
def build_mysubjects_samples_data(sample_nums=10000):
"""
划分学生参加实践课程的采样数据集
"""
all_select = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
# user_id过滤重复
all_user_ids = all_select.user_id.unique()
# 只采集指定数量的user_id
sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False)
# 取出这些user_id选择的物品
all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
# 根据user_id, shixun_id去重
all_select = all_select.drop_duplicates((['user_id', 'subject_id']))
all_select.to_csv(mysubjects_save_path + 'mysubjects_samples.csv', sep='\t', index=False, header=True)
return all_select
def clean_html(x):
return BeautifulSoup(str(x), 'html').get_text().strip().replace("\r\n", "").replace("\r", "").replace("\t", "").replace("\n", "").replace("@", "")
def get_shixuns_data_from_mysql():
"""
从mysql数据库中获取实践项目的原始数据
"""
start = datetime.datetime.now()
logger.info("开始获取实践项目数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/shixuns.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
data_df = pd.read_sql_query(sql_text, engine)
data_df.to_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', index=False, header=True)
logger.info("实践项目数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
def get_cold_start_shixuns_data_from_mysql():
"""
从mysql数据库获取实践项目原始数据
"""
start = datetime.datetime.now()
logger.info("开始获取实践项目冷启动数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/cold_start_shixuns.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
data_df = pd.read_sql_query(sql_text, engine)
# 去除无用的实训名称
regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
data_df = data_df[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]
data_df.to_csv(data_parent_path + 'cold_start_shixuns.csv', sep='\t', index=False, header=True)
logger.info("实践项目冷启动数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
def get_subjects_data_from_mysql():
"""
从mysql数据库获取实践课程原始数据
"""
start = datetime.datetime.now()
logger.info("开始获取实践课程数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/subjects.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
data_df = pd.read_sql_query(sql_text, engine)
data_df.to_csv(data_parent_path + 'subjects_raw.csv', sep='\t', index=False, header=True)
logger.info("实践课程数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
def get_myshixuns_data_from_mysql(last_login_time, last_created_time, data_save_path):
"""
从mysql数据库获取学生选择的实践项目行为数据
"""
start = datetime.datetime.now()
logger.info("开始获取学生选择的实践项目行为数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/myshixuns.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)
data_df = pd.read_sql_query(sql_text, engine)
data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
logger.info("学生实践项目行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
def get_mysubjects_data_from_mysql(last_login_time, last_created_time, data_save_path):
"""
从mysql数据库获取学生选择的实践课程行为数据
"""
start = datetime.datetime.now()
logger.info("开始获取学生选择的实践课程行为数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/mysubjects.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)
data_df = pd.read_sql_query(sql_text, engine)
data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
logger.info("学生实践课程行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
def process_users_data_null(user_info_df):
"""
填充用户数据缺失值
"""
user_info_df['edu_background'] = user_info_df['edu_background'].fillna(0.0)
user_info_df['edu_entry_year'] = user_info_df['edu_entry_year'].fillna(0.0)
user_info_df['experience'] = user_info_df['experience'].fillna(0.0)
user_info_df['gender'] = user_info_df['gender'].fillna(-1)
user_info_df['gid'] = user_info_df['gender'].fillna(0)
user_info_df['grade'] = user_info_df['grade'].fillna(0.0)
user_info_df['identity'] = user_info_df['identity'].fillna(0)
user_info_df['location'] = user_info_df['location'].fillna('')
user_info_df['location_city'] = user_info_df['location_city'].fillna('')
user_info_df['occupation'] = user_info_df['occupation'].fillna('')
user_info_df['school_id'] = user_info_df['school_id'].fillna(0)
user_info_df['school_name'] = user_info_df['school_name'].fillna('')
user_info_df['user_name'] = user_info_df['user_name'].fillna('')
user_info_df['logins'] = user_info_df['logins'].fillna(0)
return user_info_df
def get_users_data_from_mysql(last_login_time, data_save_path):
"""
从mysql数据库获取学生信息数据
"""
start = datetime.datetime.now()
logger.info("开始获取学生信息数据...")
engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
+ str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
sql_text = ''
with open(root_path + '/data_analysis/users.sql', 'r', encoding='utf-8') as fread:
for line in fread.readlines():
line = line.replace("\r\n", " ").replace("\r", " ").replace("\t", " ").replace("\n", " ")
sql_text += line
sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
data_df = pd.read_sql_query(sql_text, engine)
data_df = data_df.loc[data_df['user_id'].progress_apply(lambda x: is_number(x))]
data_df = process_users_data_null(data_df)
data_df.rename(columns={'visits': 'logins'}, inplace=True) #修改用户访问名称,与实训和课程区分,方便后期统一操作
data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
logger.info("学生信息数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "")
if __name__ == '__main__':
if samples_mode == True:
# 增量数据为最近1年登录的用户最近1年的选课数据
last_login_time = get_before_date(150)
last_created_time = get_before_date(150)
else:
# 全量数据为最近1年登录的用户的最近2年选课数据
last_login_time = get_before_date(365 )
last_created_time = get_before_date(365 * 2)
# 获取用户数据
get_users_data_from_mysql(last_login_time, users_data_path)
# 获取所有实践项目数据
get_shixuns_data_from_mysql()
get_cold_start_shixuns_data_from_mysql()
# # 获取所有实践项目选课行为数据
get_myshixuns_data_from_mysql(last_login_time, last_created_time, myshixuns_data_path)
# # 处理实践项目选课行为数据
process_shixuns_data()
build_myshixuns_train_test_data_baseline()
build_myshixuns_train_test_data()
# # 获取所有实践课程数据
get_subjects_data_from_mysql()
# # 获取所有实践课程选课行为数据
get_mysubjects_data_from_mysql(last_login_time, last_created_time, mysubjects_data_path)
# # 处理实践课程选课行为数据
process_subjects_data()
build_mysubjects_train_test_data_baseline()
build_mysubjects_train_test_data()