"""Extract knowledge-graph entity and relation tables from the raw study data.

Every raw table is a tab-separated CSV located under ``study_path_data``.
Entity tables are de-duplicated and written to ``kg_data + 'entity/'``;
relation tables are written to ``kg_data + 'relation/'``.  All work happens
at import time, in the order entities first, relations second.
"""
import os
import sys
import warnings

import pandas as pd
from tqdm import tqdm

# ``config`` is resolved relative to the current working directory, so the
# path has to be extended before the import below.
sys.path.append(os.getcwd())
from config import study_path_data
from config import kg_data

tqdm.pandas()
warnings.filterwarnings('ignore')


def _read_table(filename, usecols=None):
    """Read one tab-separated raw table from ``study_path_data``.

    ``usecols`` restricts the columns that are loaded; ``None`` loads all.
    """
    return pd.read_csv(study_path_data + filename, sep='\t',
                       usecols=usecols, low_memory=False)


def _write_table(frame, relative_path):
    """Write ``frame`` as a tab-separated file under ``kg_data`` (no index)."""
    target = kg_data + relative_path
    # to_csv does not create missing parent directories, so make sure the
    # 'entity/' or 'relation/' folder exists before writing.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    frame.to_csv(target, sep='\t', index=False)


def _extract_entity(source_file, target_file, message, usecols=None):
    """De-duplicate a raw table and store it as an entity table.

    Prints ``message`` once the file has been written and returns the
    de-duplicated frame.
    """
    frame = _read_table(source_file, usecols).drop_duplicates()
    _write_table(frame, 'entity/' + target_file)
    print(message)
    return frame


def _extract_relation(source_file, target_file, key_columns, label, message):
    """Build a labelled relation table from two key columns of a raw table.

    A constant ``relation`` column holding ``label`` is appended, duplicate
    rows are dropped, and the result is written to the relation folder.
    Prints ``message`` afterwards and returns the resulting frame.
    """
    frame = _read_table(source_file, key_columns)
    frame['relation'] = label
    frame = frame.drop_duplicates()
    _write_table(frame, 'relation/' + target_file)
    print(message)
    return frame


def _extract_stage_item_relation(stage_items, item_type, id_column,
                                 target_file, message):
    """Store the rows of ``stage_items`` whose ``item_type`` matches.

    The generic ``item_id`` column is renamed to ``id_column``.  All other
    columns are kept and no ``relation`` column is added.
    NOTE(review): the other relation tables only hold two key columns plus a
    ``relation`` label; confirm that downstream consumers expect the wider
    layout produced here.
    """
    frame = stage_items[stage_items['item_type'] == item_type]
    frame = frame.drop_duplicates()
    frame = frame.rename(columns={'item_id': id_column})
    _write_table(frame, 'relation/' + target_file)
    print(message)
    return frame


# Extract entity tables and relation tables from the raw tables.

## Entity tables

# Attachment entities
attachments = _extract_entity("attachments.csv", 'attachments.csv',
                              "附件实体提取成功!")

# Challenge entities
challenges = _extract_entity(
    "challenges.csv", 'challenges.csv', "挑战实体提取成功!",
    usecols=['challenge_id', 'challenge_name', 'created_at', 'updated_at',
             'status', 'position', 'task_pass', 'score', 'visits',
             'challenge_tags_count', 'challenge_tag'])

# Course (classroom) entities
courses = _extract_entity("courses.csv", 'courses.csv', "课堂实体提取成功!")

# Exercise (exam) entities
exercises = _extract_entity(
    "exercises.csv", 'exercises.csv', "考试实体提取成功!",
    usecols=['exercise_id', 'exercise_name', 'exercise_description',
             'exercise_status', 'created_at', 'updated_at'])

# Homework entities
homeworks = _extract_entity(
    "homeworks.csv", 'homeworks.csv', "作业实体提取成功!",
    usecols=['homework_id', 'homework_name', 'description', 'homework_type',
             'created_at', 'updated_at'])

# Shixun (practical training) entities
shixuns = _extract_entity("shixuns.csv", 'shixuns.csv', "实训实体提取成功!")

# Stage (chapter) entities
stages = _extract_entity(
    "stages.csv", 'stages.csv', "章节实体提取成功!",
    usecols=['stage_id', 'stage_name', 'description', 'shixuns_count',
             'created_at', 'updated_at'])

# Subject (practice course) entities
subjects = _extract_entity("subjects.csv", 'subjects.csv',
                           "实践课堂实体提取成功!")

# Teaching video entities (raw file is singular, output file is plural)
video_items = _extract_entity("video_item.csv", 'video_items.csv',
                              "教学视频实体提取成功!")

## Relation tables

# Shixun - challenge relation
shixun_challenge = _extract_relation(
    "challenges.csv", 'shixun_challenge.csv',
    ['shixun_id', 'challenge_id'], '实训关卡', "挑战-实训关系抽取成功!")

# Course - exercise relation
course_exercise = _extract_relation(
    "exercises.csv", 'course_exercise.csv',
    ['course_id', 'exercise_id'], '课堂考试', "课堂-考试关系抽取成功!")

# Shixun - homework relation
shixun_homework = _extract_relation(
    "homework_shixun.csv", 'shixun_homework.csv',
    ['shixun_id', 'homework_id'], '实训作业', "实训-作业关系抽取成功!")

# Subject - stage relation
subject_stage = _extract_relation(
    "stage_items.csv", 'subject_stage.csv',
    ['subject_id', 'stage_id'], '课程章节', "课程-章节关系抽取成功!")

# The three stage-item relations below all filter the complete stage_items
# table, so it is parsed once and shared instead of being re-read each time.
stage_items = _read_table("stage_items.csv")

# Stage - shixun relation
stage_shixun = _extract_stage_item_relation(
    stage_items, 'Shixun', 'shixun_id', 'stage_shixun.csv',
    "章节-实训关系抽取成功!")

# Stage - attachment (courseware) relation
stage_attachment = _extract_stage_item_relation(
    stage_items, 'Attachment', 'attachment_id', 'stage_attachment.csv',
    "章节-课件关系抽取成功!")

# Stage - video relation
stage_video_item = _extract_stage_item_relation(
    stage_items, 'VideoItem', 'video_item_id', 'stage_video_item.csv',
    "章节-视频关系抽取成功!")

# Course - subject relation
course_subject = _extract_relation(
    "subject_course_records.csv", 'course_subject.csv',
    ['subject_id', 'course_id'], '课堂课程', "课堂-课程关系抽取成功!")