import datetime
import pickle
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import pymysql  # noqa: F401 -- MySQL driver required by the sqlalchemy URL below
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from tqdm import tqdm

from config import myshixuns_data_path, mysubjects_data_path, RANDOM_SEED
from config import myshixuns_train_data, myshixuns_test_data
from config import mysubjects_train_data, mysubjects_test_data
from config import shixuns_data_path, subjects_data_path
from config import myshixuns_train_data_baseline, myshixuns_test_data_baseline
from config import mysubjects_train_data_baseline, mysubjects_test_data_baseline
from config import logger
from config import shixun_id_to_name_dict_data, subject_id_to_name_dict_data
from config import myshixuns_save_path, mysubjects_save_path, users_data_path
from config import data_parent_path
from config import mysql_database, mysql_passwd
from config import mysql_host, mysql_port, mysql_user
from config import root_path
from config import samples_mode
from utils import is_number
from utils import get_before_date

tqdm.pandas()

# Pattern used to drop junk item names that consist only of digits and
# punctuation (kept byte-identical to the original expression).
_JUNK_NAME_PATTERN = '^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'


def _str_to_timestamp(x):
    """Convert a 'YYYY-mm-dd HH:MM:SS' string to a unix timestamp (float)."""
    return time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S'))


def _make_engine():
    """Create a SQLAlchemy engine for the configured MySQL database.

    The password is URL-quoted so special characters survive the DSN.
    """
    return create_engine('mysql+pymysql://' + str(mysql_user) + ':'
                         + quote(mysql_passwd) + '@' + str(mysql_host) + ':'
                         + str(mysql_port) + '/' + str(mysql_database))


def _load_sql(relative_path):
    """Read a SQL file under ``root_path`` and flatten it to one line.

    :param relative_path: path of the .sql file relative to ``root_path``
    :return: the file content with all line breaks/tabs replaced by spaces
    """
    sql_text = ''
    with open(root_path + relative_path, 'r', encoding='utf-8') as fread:
        for line in fread:
            sql_text += (line.replace("\r\n", " ").replace("\r", " ")
                             .replace("\t", " ").replace("\n", " "))
    return sql_text


def build_myshixuns_train_test_data_baseline():
    """Split student/shixun interactions into baseline train/test sets.

    Adds a unix-timestamp column and writes a reproducible 90%/10% split
    (fixed ``RANDOM_SEED``) to the baseline output files.
    """
    logger.info("加载物品行为数据")
    data_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    logger.info("生成itemcf baseline训练集和测试集数据")
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(_str_to_timestamp)

    # 90% train, 10% test; the test set is the complement of the sample.
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(myshixuns_train_data_baseline, sep='\t', index=False, header=True)
    test_data.to_csv(myshixuns_test_data_baseline, sep='\t', index=False, header=True)


def build_myshixuns_train_test_data():
    """Split student/shixun interactions into model train/test sets.

    Same 90%/10% split as the baseline variant, but the timestamp column is
    additionally min-max normalised (used later as an association-rule weight).
    """
    logger.info("加载物品行为数据")
    data_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    logger.info("生成模型训练集和测试集数据")
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(_str_to_timestamp)

    # Min-max normalise the timestamp so it can serve as a [0, 1] weight.
    # NOTE(review): yields NaN if all timestamps are identical, as before.
    ts = data_df['created_timestamp']
    data_df['created_timestamp'] = (ts - np.min(ts)) / (np.max(ts) - np.min(ts))

    # 90% train, 10% test
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(myshixuns_train_data, sep='\t', index=False, header=True)
    test_data.to_csv(myshixuns_test_data, sep='\t', index=False, header=True)


def build_myshixuns_samples_data(sample_nums=10000):
    """Build a down-sampled shixun interaction dataset.

    Picks up to ``sample_nums`` distinct users at random and keeps their
    de-duplicated (user_id, shixun_id) selections.

    :param sample_nums: maximum number of distinct users to sample
    :return: the sampled DataFrame (also written to disk)
    """
    all_select = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    all_user_ids = all_select.user_id.unique()

    # Clamp so np.random.choice(replace=False) cannot raise ValueError when
    # fewer distinct users exist than requested.
    sample_nums = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False)

    # Keep only the sampled users' selections, de-duplicated per item.
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
    all_select = all_select.drop_duplicates(['user_id', 'shixun_id'])

    all_select.to_csv(myshixuns_save_path + 'myshixuns_samples.csv',
                      sep='\t', index=False, header=True)
    return all_select


def process_shixuns_data():
    """Clean the raw shixun dataset and persist the id->name lookup dict."""
    data_df = pd.read_csv(data_parent_path + 'shixuns_raw.csv',
                          sep='\t', encoding='utf-8')
    data_df['created_at_ts'] = data_df['created_at'].progress_apply(_str_to_timestamp)
    data_df['shixun_id'] = data_df['shixun_id'].astype(int)

    # Drop rows whose name is pure digits/punctuation (useless for display).
    data_df = data_df[~data_df['shixun_name'].astype(str).str.match(_JUNK_NAME_PATTERN)]

    # shixun_id -> shixun_name dict, used later during recall.
    shixun_id_to_name_dict = dict(zip(data_df['shixun_id'], data_df['shixun_name']))
    with open(shixun_id_to_name_dict_data, 'wb') as fout:
        pickle.dump(shixun_id_to_name_dict, fout)

    data_df.to_csv(shixuns_data_path, sep='\t', index=False, header=True)


def process_subjects_data():
    """Clean the raw subject dataset and persist the id->name lookup dict.

    Rows with a missing subject_id are dropped; missing created_at values are
    back-filled with a fixed epoch-like date before timestamp conversion.
    """
    data_df = pd.read_csv(data_parent_path + 'subjects_raw.csv',
                          sep='\t', encoding='utf-8')
    data_df.dropna(axis=0, subset=['subject_id'], inplace=True)
    data_df['created_at'].fillna('2017-01-01 00:00:00', inplace=True)
    data_df['created_at_ts'] = data_df['created_at'].progress_apply(_str_to_timestamp)
    data_df['subject_id'] = data_df['subject_id'].astype(int)
    # -1 marks "no discipline" so the column can be stored as int.
    data_df['disciplines_id'] = data_df['disciplines_id'].fillna(-1)
    data_df['disciplines_id'] = data_df['disciplines_id'].astype(int)

    # Drop rows whose name is pure digits/punctuation (useless for display).
    data_df = data_df[~data_df['subject_name'].astype(str).str.match(_JUNK_NAME_PATTERN)]

    # subject_id -> subject_name dict, used later during recall.
    subject_id_to_name_dict = dict(zip(data_df['subject_id'], data_df['subject_name']))
    with open(subject_id_to_name_dict_data, 'wb') as fout:
        pickle.dump(subject_id_to_name_dict, fout)

    data_df.to_csv(subjects_data_path, sep='\t', index=False, header=True)


def build_mysubjects_train_test_data_baseline():
    """Split student/subject interactions into baseline train/test sets."""
    logger.info("生成itemcf baseline训练集和测试集数据")
    data_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(_str_to_timestamp)

    # 90% train, 10% test
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(mysubjects_train_data_baseline, sep='\t', index=False, header=True)
    test_data.to_csv(mysubjects_test_data_baseline, sep='\t', index=False, header=True)


def build_mysubjects_train_test_data():
    """Split student/subject interactions into model train/test sets.

    Mirrors :func:`build_myshixuns_train_test_data`, including the min-max
    normalisation of the timestamp used as an association-rule weight.
    """
    logger.info("加载物品行为数据")
    data_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')

    logger.info("生成模型训练集和测试集数据")
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(_str_to_timestamp)

    # Min-max normalise the timestamp so it can serve as a [0, 1] weight.
    ts = data_df['created_timestamp']
    data_df['created_timestamp'] = (ts - np.min(ts)) / (np.max(ts) - np.min(ts))

    # 90% train, 10% test
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(mysubjects_train_data, sep='\t', index=False, header=True)
    test_data.to_csv(mysubjects_test_data, sep='\t', index=False, header=True)


def build_mysubjects_samples_data(sample_nums=10000):
    """Build a down-sampled subject interaction dataset.

    Picks up to ``sample_nums`` distinct users at random and keeps their
    de-duplicated (user_id, subject_id) selections.

    :param sample_nums: maximum number of distinct users to sample
    :return: the sampled DataFrame (also written to disk)
    """
    all_select = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')

    all_user_ids = all_select.user_id.unique()

    # Clamp so np.random.choice(replace=False) cannot raise ValueError when
    # fewer distinct users exist than requested.
    sample_nums = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False)

    # Keep only the sampled users' selections, de-duplicated per item.
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
    all_select = all_select.drop_duplicates(['user_id', 'subject_id'])

    all_select.to_csv(mysubjects_save_path + 'mysubjects_samples.csv',
                      sep='\t', index=False, header=True)
    return all_select


def clean_html(x):
    """Strip HTML tags and line-break/control characters from a text field.

    :param x: any value; it is stringified before parsing
    :return: plain text with CR/LF/TAB and '@' removed
    """
    # Name the parser explicitly: the original passed the 'html' feature,
    # which lets bs4 pick whichever HTML parser happens to be installed and
    # makes the output environment-dependent.
    text = BeautifulSoup(str(x), 'html.parser').get_text().strip()
    for token in ("\r\n", "\r", "\t", "\n", "@"):
        text = text.replace(token, "")
    return text


def get_shixuns_data_from_mysql():
    """Download the raw shixun (practice project) table to a local CSV."""
    start = datetime.datetime.now()
    logger.info("开始获取实践项目数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/shixuns.sql')

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_parent_path + 'shixuns_raw.csv',
                   sep='\t', index=False, header=True)

    logger.info("实践项目数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def get_cold_start_shixuns_data_from_mysql():
    """Download cold-start shixun data, dropping junk-named rows."""
    start = datetime.datetime.now()
    logger.info("开始获取实践项目冷启动数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/cold_start_shixuns.sql')

    data_df = pd.read_sql_query(sql_text, engine)

    # Drop rows whose name is pure digits/punctuation (useless for display).
    data_df = data_df[~data_df['shixun_name'].astype(str).str.match(_JUNK_NAME_PATTERN)]

    data_df.to_csv(data_parent_path + 'cold_start_shixuns.csv',
                   sep='\t', index=False, header=True)

    logger.info("实践项目冷启动数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def get_subjects_data_from_mysql():
    """Download the raw subject (practice course) table to a local CSV."""
    start = datetime.datetime.now()
    logger.info("开始获取实践课程数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/subjects.sql')

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_parent_path + 'subjects_raw.csv',
                   sep='\t', index=False, header=True)

    logger.info("实践课程数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def get_myshixuns_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """Download students' shixun selection behaviour to a local CSV.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL file
    :param last_created_time: substituted for LAST_CREATED_TIME in the SQL file
    :param data_save_path: output CSV path
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生选择的实践项目行为数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/myshixuns.sql')
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生实践项目行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def get_mysubjects_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """Download students' subject selection behaviour to a local CSV.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL file
    :param last_created_time: substituted for LAST_CREATED_TIME in the SQL file
    :param data_save_path: output CSV path
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生选择的实践课程行为数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/mysubjects.sql')
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生实践课程行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def process_users_data_null(user_info_df):
    """Fill missing values in the user-info DataFrame, column by column.

    Numeric-ish columns default to 0/0.0/-1, free-text columns to ''.

    :param user_info_df: user DataFrame (mutated in place)
    :return: the same DataFrame, for call chaining
    """
    user_info_df['edu_background'] = user_info_df['edu_background'].fillna(0.0)
    user_info_df['edu_entry_year'] = user_info_df['edu_entry_year'].fillna(0.0)
    user_info_df['experience'] = user_info_df['experience'].fillna(0.0)
    user_info_df['gender'] = user_info_df['gender'].fillna(-1)
    # BUGFIX: the original filled 'gid' from the 'gender' column
    # (user_info_df['gender'].fillna(0)), clobbering gid with gender values.
    user_info_df['gid'] = user_info_df['gid'].fillna(0)
    user_info_df['grade'] = user_info_df['grade'].fillna(0.0)
    user_info_df['identity'] = user_info_df['identity'].fillna(0)
    user_info_df['location'] = user_info_df['location'].fillna('')
    user_info_df['location_city'] = user_info_df['location_city'].fillna('')
    user_info_df['occupation'] = user_info_df['occupation'].fillna('')
    user_info_df['school_id'] = user_info_df['school_id'].fillna(0)
    user_info_df['school_name'] = user_info_df['school_name'].fillna('')
    user_info_df['user_name'] = user_info_df['user_name'].fillna('')
    user_info_df['logins'] = user_info_df['logins'].fillna(0)
    return user_info_df


def get_users_data_from_mysql(last_login_time, data_save_path):
    """Download student profile data to a local CSV.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL file
    :param data_save_path: output CSV path
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生信息数据...")

    engine = _make_engine()
    sql_text = _load_sql('/data_analysis/users.sql')
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df = data_df.loc[data_df['user_id'].progress_apply(is_number)]

    # BUGFIX: rename 'visits' -> 'logins' BEFORE filling nulls; the original
    # renamed afterwards, so process_users_data_null's access to
    # data_df['logins'] would raise KeyError. The rename distinguishes user
    # visits from shixun/subject visit counts for later unified handling.
    data_df.rename(columns={'visits': 'logins'}, inplace=True)
    data_df = process_users_data_null(data_df)

    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生信息数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


if __name__ == '__main__':
    if samples_mode:
        # Sampled mode: users who logged in within the last 150 days, and
        # their selections from the last 150 days.
        # NOTE(review): the original comment said "last 1 year" but the code
        # uses 150 days -- kept the code's behaviour.
        last_login_time = get_before_date(150)
        last_created_time = get_before_date(150)
    else:
        # Full mode: users who logged in within the last year, and their
        # selections from the last two years.
        last_login_time = get_before_date(365)
        last_created_time = get_before_date(365 * 2)

    # Fetch user profile data
    get_users_data_from_mysql(last_login_time, users_data_path)

    # Fetch all shixun (practice project) data
    get_shixuns_data_from_mysql()
    get_cold_start_shixuns_data_from_mysql()

    # Fetch shixun selection behaviour data
    get_myshixuns_data_from_mysql(last_login_time, last_created_time,
                                  myshixuns_data_path)

    # Process shixun data and build the train/test splits
    process_shixuns_data()
    build_myshixuns_train_test_data_baseline()
    build_myshixuns_train_test_data()

    # Fetch all subject (practice course) data
    get_subjects_data_from_mysql()

    # Fetch subject selection behaviour data
    get_mysubjects_data_from_mysql(last_login_time, last_created_time,
                                   mysubjects_data_path)

    # Process subject data and build the train/test splits
    process_subjects_data()
    build_mysubjects_train_test_data_baseline()
    build_mysubjects_train_test_data()