|
|
import pandas as pd
|
|
|
import os
|
|
|
import sys
|
|
|
sys.path.append(os.getcwd())
|
|
|
from config import study_path
|
|
|
from config import logger
|
|
|
from config import study_path_data
|
|
|
from tqdm import tqdm
|
|
|
from config import mysql_test_database, mysql_test_passwd
|
|
|
from config import mysql_test_host, mysql_test_port, mysql_test_user
|
|
|
from sqlalchemy import create_engine
|
|
|
import datetime
|
|
|
from utils import get_before_date
|
|
|
from urllib.parse import quote
|
|
|
|
|
|
tqdm.pandas()
|
|
|
|
|
|
def get_attachments_data_from_mysql(data_time):
    """Fetch raw attachment data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``created_on``
            is strictly later than this value are kept.

    Side effects:
        Writes ``attachments.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取附件数据...")

    # quote() percent-encodes the password so special characters
    # (e.g. '@' or '/') cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/attachments.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['attachment_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_on'] = data_df['created_on'].fillna('2017-01-01 00:00:00')

    # Drop rows whose filename/description consist only of digits or
    # punctuation. Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['filename'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["created_on"] > data_time]
    data_df.to_csv(study_path_data + 'attachments.csv', sep='\t', index=False, header=True)

    logger.info(f"附件数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_challenge_data_from_mysql(data_time):
    """Fetch raw challenge data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``challenges.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取挑战数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/challenges.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['challenge_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/tag fields consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['challenge_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['task_pass'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['challenge_tag'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'challenges.csv', sep='\t', index=False, header=True)

    logger.info(f"挑战数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_course_data_from_mysql(data_time):
    """Fetch raw teaching-course data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``courses.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取教学课堂数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/courses.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['course_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/description consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['course_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'courses.csv', sep='\t', index=False, header=True)

    logger.info(f"教学课堂数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_exercise_data_from_mysql(data_time):
    """Fetch raw exam (exercise) data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``exercises.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取考试数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/exercises.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['exercise_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/description consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['exercise_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['exercise_description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'exercises.csv', sep='\t', index=False, header=True)

    logger.info(f"考试数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_homework_shixun_data_from_mysql(data_time):
    """Fetch raw shixun-homework data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``homework_shixun.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实训作业数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/homework_shixun.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Both key columns are required; a single dropna covers the two
    # consecutive calls in the original.
    data_df.dropna(axis=0, subset=['shixun_id', 'homework_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'homework_shixun.csv', sep='\t', index=False, header=True)

    logger.info(f"实训作业数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_homework_data_from_mysql(data_time):
    """Fetch raw homework data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``homeworks.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取作业数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/homeworks.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['homework_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/description consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['homework_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'homeworks.csv', sep='\t', index=False, header=True)

    logger.info(f"作业数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_stage_data_from_mysql(data_time):
    """Fetch raw chapter (stage) data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``stages.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取章节数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/stages.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Both key columns are required; a single dropna covers the two
    # consecutive calls in the original.
    data_df.dropna(axis=0, subset=['stage_id', 'subject_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/description consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['stage_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'stages.csv', sep='\t', index=False, header=True)

    logger.info(f"章节数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_stage_item_data_from_mysql(data_time):
    """Fetch raw chapter-item (stage item) data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``stage_items.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取章节项目数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/stage_items.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # All three key columns are required; a single dropna covers the three
    # consecutive calls in the original.
    data_df.dropna(axis=0, subset=['item_id', 'subject_id', 'stage_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'stage_items.csv', sep='\t', index=False, header=True)

    logger.info(f"章节项目数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_subject_course_data_from_mysql(data_time):
    """Fetch raw subject-course record data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``subject_course_records.csv`` (tab-separated) under
        ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取课堂课程数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/subject_course_records.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Both key columns are required; a single dropna covers the two
    # consecutive calls in the original.
    data_df.dropna(axis=0, subset=['subject_id', 'course_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'subject_course_records.csv', sep='\t', index=False, header=True)

    logger.info(f"课堂课程数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_shixuns_data_from_mysql(data_time):
    """Fetch raw practical-project (shixun) data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``shixuns.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实践项目数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/shixuns.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['shixun_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose text fields consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings. The same filter
    # is applied to every free-text column.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    for text_col in ('shixun_name', 'shixun_tag', 'language', 'description', 'propaedeutics'):
        data_df = data_df[~data_df[text_col].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'shixuns.csv', sep='\t', index=False, header=True)

    logger.info(f"实践项目数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_subjects_data_from_mysql(data_time):
    """Fetch raw practical-course (subject) data from MySQL, clean it, and export it as TSV.

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``subjects.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实践课程数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/subjects.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['subject_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose text fields consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['subject_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['learning_notes'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'subjects.csv', sep='\t', index=False, header=True)

    logger.info(f"实践课程数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_video_item_data_from_mysql(data_time):
    """Fetch raw video-item data from MySQL, clean it, and export it as TSV.

    (The original docstring said "practical course" — a copy-paste error;
    this function reads ``video_items.sql`` and filters video columns.)

    Args:
        data_time: cutoff timestamp string; only rows whose ``updated_at``
            is strictly later than this value are kept.

    Side effects:
        Writes ``video_item.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取视频项目数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/video_items.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    data_df.dropna(axis=0, subset=['video_item_id'], inplace=True)
    # Column reassignment avoids the chained-assignment fillna form that is
    # deprecated in pandas 2.x.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['updated_at'] = data_df['updated_at'].fillna('2017-01-01 00:00:00')

    # Drop rows whose name/description consist only of digits or punctuation.
    # Raw string prevents invalid-escape-sequence warnings.
    regex_pattern = r'^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['video_name'].astype(str).str.match(regex_pattern)]
    data_df = data_df[~data_df['description'].astype(str).str.match(regex_pattern)]

    # Keep only records newer than the cutoff.
    data_df = data_df[data_df["updated_at"] > data_time]
    data_df.to_csv(study_path_data + 'video_item.csv', sep='\t', index=False, header=True)

    logger.info(f"视频项目数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_watch_course_video_from_mysql():
    """Fetch the course-video watch history from MySQL and export it as TSV.

    Side effects:
        Writes ``watch_course_videos.csv`` (tab-separated) under
        ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取观看教学视频数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/watch_course_videos.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Missing values are treated as zero (e.g. no watch time recorded).
    data_df = data_df.fillna(0)
    data_df.to_csv(study_path_data + 'watch_course_videos.csv', sep='\t', index=False, header=True)

    logger.info(f"观看教学视频数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_games_from_mysql():
    """Fetch students' level-challenge (game) records from MySQL and export them as TSV.

    Side effects:
        Writes ``games.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生闯关情况数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/games.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Missing values are treated as 0.0 (float, matching the original `0.`).
    data_df = data_df.fillna(0.)

    data_df.to_csv(study_path_data + 'games.csv', sep='\t', index=False, header=True)

    logger.info(f"学生闯关情况数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_student_works_from_mysql():
    """Fetch students' homework submission records from MySQL and export them as TSV.

    Side effects:
        Writes ``student_works.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生作业情况数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/student_works.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Missing values are treated as zero.
    data_df = data_df.fillna(0)

    data_df.to_csv(study_path_data + 'student_works.csv', sep='\t', index=False, header=True)

    logger.info(f"学生作业情况数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
def get_attachment_logs_from_mysql():
    """Fetch students' courseware-access logs from MySQL and export them as TSV.

    Side effects:
        Writes ``attachment_logs.csv`` (tab-separated) under ``study_path_data``.
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生访问课件情况数据...")

    # quote() percent-encodes the password so special characters
    # cannot break the connection URL.
    engine = create_engine(f'mysql+pymysql://{mysql_test_user}:{quote(mysql_test_passwd)}'
                           f'@{mysql_test_host}:{mysql_test_port}/{mysql_test_database}')

    try:
        # Load the query as a single line: collapse CR/LF/tabs into spaces.
        with open(study_path + 'data_analysis/attachment_logs.sql', 'r', encoding='utf-8') as fread:
            sql_text = fread.read().replace("\r", " ").replace("\t", " ").replace("\n", " ")

        data_df = pd.read_sql_query(sql_text, engine)
    finally:
        # Return pooled connections instead of leaking the engine.
        engine.dispose()

    # Missing values are treated as zero.
    data_df = data_df.fillna(0)

    data_df.to_csv(study_path_data + 'attachment_logs.csv', sep='\t', index=False, header=True)

    logger.info(f"学生访问课件情况数据下载完毕,共{data_df.shape[0]}条数据,总耗时{(datetime.datetime.now() - start).seconds}秒")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    # Cutoff date N days back (here: 3 years); extractors keep only rows
    # updated/created after this date.
    data_time=get_before_date(365*3)

    # Fetch all attachment data
    # get_attachments_data_from_mysql(data_time)

    # Fetch all challenge data
    # get_challenge_data_from_mysql(data_time)

    # Fetch all teaching-course data
    # get_course_data_from_mysql(data_time)

    # Fetch all exam data
    # get_exercise_data_from_mysql(data_time)

    # Fetch all shixun-homework data
    # get_homework_shixun_data_from_mysql(data_time)

    # Fetch all homework data
    # get_homework_data_from_mysql(data_time)

    # Fetch all chapter data
    # get_stage_data_from_mysql(data_time)

    # Fetch all chapter-item data
    # get_stage_item_data_from_mysql(data_time)

    # Fetch all practical-project data
    # get_shixuns_data_from_mysql(data_time)

    # Fetch all practical-course data
    # get_subjects_data_from_mysql(data_time)

    # Fetch all subject-course record data
    # get_subject_course_data_from_mysql(data_time)

    # Fetch all video-item data
    # get_video_item_data_from_mysql(data_time)

    # get_watch_course_video_from_mysql()

    # get_games_from_mysql()

    # get_student_works_from_mysql()

    # NOTE(review): only the attachment-log extractor is currently enabled;
    # the calls above are intentionally commented out — presumably run
    # selectively. Confirm before re-enabling.
    get_attachment_logs_from_mysql()

    print("数据获取完成!")
|
|
|
|
|
|
|
|
|
|