"""Download raw teaching-platform tables from MySQL and dump them as TSV files.

Each public ``get_*`` function reads its SQL statement from
``study_path + 'data_analysis/<name>.sql'``, runs it against the test MySQL
instance, applies light cleaning (drop rows with NULL ids, default missing
timestamps, drop junk-text rows, keep only rows newer than ``data_time``) and
writes the result to ``study_path_data + '<name>.csv'`` (tab separated).
"""
import datetime
import os
import sys
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm

sys.path.append(os.getcwd())  # allow importing project-local modules when run as a script

from config import (logger, mysql_test_database, mysql_test_host,
                    mysql_test_passwd, mysql_test_port, mysql_test_user,
                    study_path, study_path_data)
from utils import get_before_date

tqdm.pandas()

# Timestamp substituted for NULL created_at/updated_at values.
_DEFAULT_TIMESTAMP = '2017-01-01 00:00:00'
# Names/descriptions consisting purely of digits and punctuation are noise;
# rows whose text columns fully match this pattern are dropped.  The literal
# is kept byte-identical to the original inline copies.
_JUNK_TEXT_REGEX = '^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'


def _make_engine():
    """Build a SQLAlchemy engine for the test MySQL database.

    The password is URL-quoted so special characters survive the DSN.
    """
    return create_engine(
        'mysql+pymysql://' + str(mysql_test_user) + ':' + quote(mysql_test_passwd)
        + '@' + str(mysql_test_host) + ':' + str(mysql_test_port)
        + '/' + str(mysql_test_database))


def _read_sql_text(sql_name):
    """Return data_analysis/<sql_name>.sql with line breaks and tabs flattened to spaces."""
    with open(study_path + 'data_analysis/' + sql_name + '.sql', 'r',
              encoding='utf-8') as fread:
        text = fread.read()
    for token in ('\r\n', '\r', '\t', '\n'):
        text = text.replace(token, ' ')
    return text


def _fetch_table(label, sql_name, csv_name, id_columns=(), time_columns=(),
                 text_columns=(), time_column=None, data_time=None,
                 fill_value=None):
    """Shared download pipeline behind every public ``get_*`` function.

    Parameters
    ----------
    label : str
        Chinese label embedded in the log messages (kept identical to the
        original per-function log strings).
    sql_name : str
        Base name of the .sql file under ``study_path + 'data_analysis/'``.
    csv_name : str
        Output file name under ``study_path_data``.
    id_columns : sequence of str
        Rows with NULL in any of these columns are dropped.
    time_columns : sequence of str
        NULLs in these columns are replaced with ``_DEFAULT_TIMESTAMP``.
    text_columns : sequence of str
        Rows whose value fully matches ``_JUNK_TEXT_REGEX`` are dropped.
    time_column : str or None
        When given, keep only rows with ``time_column > data_time``
        (string comparison, as in the original code).
    data_time : str or None
        Lower bound for ``time_column``.
    fill_value : scalar or None
        When given, every remaining NaN in the frame is replaced with it
        (used by the history/log tables instead of the cleaning steps above).
    """
    start = datetime.datetime.now()
    logger.info("开始获取" + label + "数据...")
    data_df = pd.read_sql_query(_read_sql_text(sql_name), _make_engine())
    for col in id_columns:
        data_df.dropna(axis=0, subset=[col], inplace=True)
    for col in time_columns:
        data_df[col].fillna(_DEFAULT_TIMESTAMP, inplace=True)
    for col in text_columns:
        data_df = data_df[~data_df[col].astype(str).str.match(_JUNK_TEXT_REGEX)]
    if fill_value is not None:
        data_df.fillna(fill_value, inplace=True)
    if time_column is not None:
        data_df = data_df[data_df[time_column] > data_time]
    data_df.to_csv(study_path_data + csv_name, sep='\t', index=False, header=True)
    logger.info(label + "数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时"
                + str((datetime.datetime.now() - start).seconds) + "秒")


def get_attachments_data_from_mysql(data_time):
    """Fetch raw attachment rows created after *data_time* into attachments.csv."""
    _fetch_table('附件', 'attachments', 'attachments.csv',
                 id_columns=('attachment_id',), time_columns=('created_on',),
                 text_columns=('filename', 'description'),
                 time_column='created_on', data_time=data_time)


def get_challenge_data_from_mysql(data_time):
    """Fetch raw challenge rows updated after *data_time* into challenges.csv."""
    _fetch_table('挑战', 'challenges', 'challenges.csv',
                 id_columns=('challenge_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('challenge_name', 'task_pass', 'challenge_tag'),
                 time_column='updated_at', data_time=data_time)


def get_course_data_from_mysql(data_time):
    """Fetch raw teaching-course rows updated after *data_time* into courses.csv."""
    _fetch_table('教学课堂', 'courses', 'courses.csv',
                 id_columns=('course_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('course_name', 'description'),
                 time_column='updated_at', data_time=data_time)


def get_exercise_data_from_mysql(data_time):
    """Fetch raw exam rows updated after *data_time* into exercises.csv."""
    _fetch_table('考试', 'exercises', 'exercises.csv',
                 id_columns=('exercise_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('exercise_name', 'exercise_description'),
                 time_column='updated_at', data_time=data_time)


def get_homework_shixun_data_from_mysql(data_time):
    """Fetch raw practical-homework rows updated after *data_time* into homework_shixun.csv."""
    _fetch_table('实训作业', 'homework_shixun', 'homework_shixun.csv',
                 id_columns=('shixun_id', 'homework_id'),
                 time_columns=('created_at', 'updated_at'),
                 time_column='updated_at', data_time=data_time)


def get_homework_data_from_mysql(data_time):
    """Fetch raw homework rows updated after *data_time* into homeworks.csv."""
    _fetch_table('作业', 'homeworks', 'homeworks.csv',
                 id_columns=('homework_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('homework_name', 'description'),
                 time_column='updated_at', data_time=data_time)


def get_stage_data_from_mysql(data_time):
    """Fetch raw chapter (stage) rows updated after *data_time* into stages.csv."""
    _fetch_table('章节', 'stages', 'stages.csv',
                 id_columns=('stage_id', 'subject_id'),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('stage_name', 'description'),
                 time_column='updated_at', data_time=data_time)


def get_stage_item_data_from_mysql(data_time):
    """Fetch raw chapter-item rows updated after *data_time* into stage_items.csv."""
    _fetch_table('章节项目', 'stage_items', 'stage_items.csv',
                 id_columns=('item_id', 'subject_id', 'stage_id'),
                 time_columns=('created_at', 'updated_at'),
                 time_column='updated_at', data_time=data_time)


def get_subject_course_data_from_mysql(data_time):
    """Fetch raw subject/course link rows updated after *data_time* into subject_course_records.csv."""
    _fetch_table('课堂课程', 'subject_course_records', 'subject_course_records.csv',
                 id_columns=('subject_id', 'course_id'),
                 time_columns=('created_at', 'updated_at'),
                 time_column='updated_at', data_time=data_time)


def get_shixuns_data_from_mysql(data_time):
    """Fetch raw practical-project (shixun) rows updated after *data_time* into shixuns.csv."""
    _fetch_table('实践项目', 'shixuns', 'shixuns.csv',
                 id_columns=('shixun_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('shixun_name', 'shixun_tag', 'language',
                               'description', 'propaedeutics'),
                 time_column='updated_at', data_time=data_time)


def get_subjects_data_from_mysql(data_time):
    """Fetch raw practical-course (subject) rows updated after *data_time* into subjects.csv."""
    _fetch_table('实践课程', 'subjects', 'subjects.csv',
                 id_columns=('subject_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('subject_name', 'description', 'learning_notes'),
                 time_column='updated_at', data_time=data_time)


def get_video_item_data_from_mysql(data_time):
    """Fetch raw video-item rows updated after *data_time* into video_item.csv."""
    # NOTE: output name is 'video_item.csv' (singular) — kept as in the original.
    _fetch_table('视频项目', 'video_items', 'video_item.csv',
                 id_columns=('video_item_id',),
                 time_columns=('created_at', 'updated_at'),
                 text_columns=('video_name', 'description'),
                 time_column='updated_at', data_time=data_time)


def get_watch_course_video_from_mysql():
    """Fetch the teaching-video watch history into watch_course_videos.csv (NaN -> 0)."""
    _fetch_table('观看教学视频', 'watch_course_videos', 'watch_course_videos.csv',
                 fill_value=0)


def get_games_from_mysql():
    """Fetch the per-student challenge-pass (game) history into games.csv (NaN -> 0.0)."""
    _fetch_table('学生闯关情况', 'games', 'games.csv', fill_value=0.)


def get_student_works_from_mysql():
    """Fetch the student-homework submission history into student_works.csv (NaN -> 0)."""
    _fetch_table('学生作业情况', 'student_works', 'student_works.csv', fill_value=0)


def get_attachment_logs_from_mysql():
    """Fetch the courseware-access log into attachment_logs.csv (NaN -> 0)."""
    _fetch_table('学生访问课件情况', 'attachment_logs', 'attachment_logs.csv',
                 fill_value=0)


if __name__ == '__main__':
    # Export rows changed within the last three years.
    data_time = get_before_date(365 * 3)
    # Uncomment the fetches you need; by default only attachment logs are pulled.
    # get_attachments_data_from_mysql(data_time)
    # get_challenge_data_from_mysql(data_time)
    # get_course_data_from_mysql(data_time)
    # get_exercise_data_from_mysql(data_time)
    # get_homework_shixun_data_from_mysql(data_time)
    # get_homework_data_from_mysql(data_time)
    # get_stage_data_from_mysql(data_time)
    # get_stage_item_data_from_mysql(data_time)
    # get_shixuns_data_from_mysql(data_time)
    # get_subjects_data_from_mysql(data_time)
    # get_subject_course_data_from_mysql(data_time)
    # get_video_item_data_from_mysql(data_time)
    # get_watch_course_video_from_mysql()
    # get_games_from_mysql()
    # get_student_works_from_mysql()
    get_attachment_logs_from_mysql()
    print("数据获取完成!")