You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
135 lines
6.4 KiB
135 lines
6.4 KiB
import pandas as pd
|
|
import os
|
|
import sys
|
|
sys.path.append(os.getcwd())
|
|
from config import study_path_data
|
|
from config import kg_data
|
|
from tqdm import tqdm
|
|
import warnings
|
|
|
|
# Hook tqdm into pandas before any processing starts.
tqdm.pandas()
# Suppress all warnings for the rest of the script run.
warnings.filterwarnings(action='ignore')
|
|
# Extract the entity tables and the relation tables from the raw tables.
|
|
|
|
## Entity table extraction
|
|
# Attachment entities: load the raw tab-separated table, drop exact duplicate
# rows, and write the result to the entity folder.
attachments = pd.read_csv(
    study_path_data + "attachments.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
attachments.to_csv(kg_data + 'entity/attachments.csv', sep='\t', index=False)
print("附件实体提取成功!")
|
|
|
|
# Challenge entities: read only the columns listed below from challenges.csv,
# drop duplicate rows, and write the result to the entity folder.
challenges = pd.read_csv(
    study_path_data + "challenges.csv",
    sep='\t',
    usecols=[
        'challenge_id',
        'challenge_name',
        'created_at',
        'updated_at',
        'status',
        'position',
        'task_pass',
        'score',
        'visits',
        'challenge_tags_count',
        'challenge_tag',
    ],
    low_memory=False,
).drop_duplicates()
challenges.to_csv(kg_data + 'entity/challenges.csv', sep='\t', index=False)
print("挑战实体提取成功!")
|
|
|
|
# Classroom (course) entities: load every column of courses.csv, drop
# duplicate rows, and write the result to the entity folder.
courses = pd.read_csv(
    study_path_data + "courses.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
courses.to_csv(kg_data + 'entity/courses.csv', sep='\t', index=False)
print("课堂实体提取成功!")
|
|
|
|
# Exam (exercise) entities: read only the columns listed below from
# exercises.csv, drop duplicate rows, and write the result to the entity folder.
exercises = pd.read_csv(
    study_path_data + "exercises.csv",
    sep='\t',
    usecols=[
        'exercise_id',
        'exercise_name',
        'exercise_description',
        'exercise_status',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
exercises.to_csv(kg_data + 'entity/exercises.csv', sep='\t', index=False)
print("考试实体提取成功!")
|
|
|
|
# Homework entities: read only the columns listed below from homeworks.csv,
# drop duplicate rows, and write the result to the entity folder.
homeworks = pd.read_csv(
    study_path_data + "homeworks.csv",
    sep='\t',
    usecols=[
        'homework_id',
        'homework_name',
        'description',
        'homework_type',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
homeworks.to_csv(kg_data + 'entity/homeworks.csv', sep='\t', index=False)
print("作业实体提取成功!")
|
|
|
|
# Practical-training (shixun) entities: load every column of shixuns.csv,
# drop duplicate rows, and write the result to the entity folder.
shixuns = pd.read_csv(
    study_path_data + "shixuns.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
shixuns.to_csv(kg_data + 'entity/shixuns.csv', sep='\t', index=False)
print("实训实体提取成功!")
|
|
|
|
# Chapter (stage) entities: read only the columns listed below from
# stages.csv, drop duplicate rows, and write the result to the entity folder.
stages = pd.read_csv(
    study_path_data + "stages.csv",
    sep='\t',
    usecols=[
        'stage_id',
        'stage_name',
        'description',
        'shixuns_count',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
stages.to_csv(kg_data + 'entity/stages.csv', sep='\t', index=False)
print("章节实体提取成功!")
|
|
|
|
# Practice-classroom (subject) entities: load every column of subjects.csv,
# drop duplicate rows, and write the result to the entity folder.
subjects = pd.read_csv(
    study_path_data + "subjects.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
subjects.to_csv(kg_data + 'entity/subjects.csv', sep='\t', index=False)
print("实践课堂实体提取成功!")
|
|
|
|
# Teaching-video entities: load every column of video_item.csv (singular
# input name), drop duplicate rows, and write to entity/video_items.csv.
video_items = pd.read_csv(
    study_path_data + "video_item.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
video_items.to_csv(kg_data + 'entity/video_items.csv', sep='\t', index=False)
print("教学视频实体提取成功!")
|
|
|
|
## Relation table extraction
|
|
# Challenge <-> shixun relations: take the (shixun_id, challenge_id) pairs
# from challenges.csv, tag each pair with a constant relation label, drop
# duplicate rows, and write the result to the relation folder.
shixun_challenge = (
    pd.read_csv(
        study_path_data + "challenges.csv",
        sep='\t',
        usecols=['shixun_id', 'challenge_id'],
        low_memory=False,
    )
    .assign(relation='实训关卡')
    .drop_duplicates()
)
shixun_challenge.to_csv(kg_data + 'relation/shixun_challenge.csv', sep='\t', index=False)
print("挑战-实训关系抽取成功!")
|
|
|
|
# Exam <-> classroom relations: take the (course_id, exercise_id) pairs from
# exercises.csv, tag each pair with a constant relation label, drop duplicate
# rows, and write the result to the relation folder.
course_exercise = (
    pd.read_csv(
        study_path_data + "exercises.csv",
        sep='\t',
        usecols=['course_id', 'exercise_id'],
        low_memory=False,
    )
    .assign(relation='课堂考试')
    .drop_duplicates()
)
course_exercise.to_csv(kg_data + 'relation/course_exercise.csv', sep='\t', index=False)
print("课堂-考试关系抽取成功!")
|
|
|
|
# Homework <-> shixun relations: take the (shixun_id, homework_id) pairs from
# homework_shixun.csv, tag each pair with a constant relation label, drop
# duplicate rows, and write the result to the relation folder.
shixun_homework = (
    pd.read_csv(
        study_path_data + "homework_shixun.csv",
        sep='\t',
        usecols=['shixun_id', 'homework_id'],
        low_memory=False,
    )
    .assign(relation='实训作业')
    .drop_duplicates()
)
shixun_homework.to_csv(kg_data + 'relation/shixun_homework.csv', sep='\t', index=False)
print("实训-作业关系抽取成功!")
|
|
|
|
# Subject <-> chapter relations: take the (subject_id, stage_id) pairs from
# stage_items.csv, tag each pair with a constant relation label, drop
# duplicate rows, and write the result to the relation folder.
subject_stage = (
    pd.read_csv(
        study_path_data + "stage_items.csv",
        sep='\t',
        usecols=['subject_id', 'stage_id'],
        low_memory=False,
    )
    .assign(relation='课程章节')
    .drop_duplicates()
)
subject_stage.to_csv(kg_data + 'relation/subject_stage.csv', sep='\t', index=False)
print("课程-章节关系抽取成功!")
|
|
|
|
# Chapter <-> shixun relations: keep the stage_items rows whose item_type is
# 'Shixun', drop duplicate rows, and rename item_id to shixun_id.
# All stage_items columns are written out and no relation label column is
# added; the narrowing to (stage_id, item_id) and the label assignment were
# disabled in the original script.
stage_shixun = (
    pd.read_csv(
        study_path_data + "stage_items.csv",
        sep='\t',
        low_memory=False,
    )
    .loc[lambda items: items['item_type'] == 'Shixun']
    .drop_duplicates()
    .rename(columns={'item_id': 'shixun_id'})
)
stage_shixun.to_csv(kg_data + 'relation/stage_shixun.csv', sep='\t', index=False)
print("章节-实训关系抽取成功!")
|
|
|
|
# Chapter <-> courseware relations: keep the stage_items rows whose item_type
# is 'Attachment', drop duplicate rows, and rename item_id to attachment_id.
# All stage_items columns are written out and no relation label column is
# added; the narrowing to (stage_id, item_id) and the label assignment were
# disabled in the original script.
stage_attachment = (
    pd.read_csv(
        study_path_data + "stage_items.csv",
        sep='\t',
        low_memory=False,
    )
    .loc[lambda items: items['item_type'] == 'Attachment']
    .drop_duplicates()
    .rename(columns={'item_id': 'attachment_id'})
)
stage_attachment.to_csv(kg_data + 'relation/stage_attachment.csv', sep='\t', index=False)
print("章节-课件关系抽取成功!")
|
|
|
|
# Chapter <-> video relations: keep the stage_items rows whose item_type is
# 'VideoItem', drop duplicate rows, and rename item_id to video_item_id.
# All stage_items columns are written out and no relation label column is
# added; the narrowing to (stage_id, item_id) and the label assignment were
# disabled in the original script.
stage_video_item = (
    pd.read_csv(
        study_path_data + "stage_items.csv",
        sep='\t',
        low_memory=False,
    )
    .loc[lambda items: items['item_type'] == 'VideoItem']
    .drop_duplicates()
    .rename(columns={'item_id': 'video_item_id'})
)
stage_video_item.to_csv(kg_data + 'relation/stage_video_item.csv', sep='\t', index=False)
print("章节-视频关系抽取成功!")
|
|
|
|
# Classroom <-> subject relations: take the (subject_id, course_id) pairs from
# subject_course_records.csv, tag each pair with a constant relation label,
# drop duplicate rows, and write the result to the relation folder.
course_subject = (
    pd.read_csv(
        study_path_data + "subject_course_records.csv",
        sep='\t',
        usecols=['subject_id', 'course_id'],
        low_memory=False,
    )
    .assign(relation='课堂课程')
    .drop_duplicates()
)
course_subject.to_csv(kg_data + 'relation/course_subject.csv', sep='\t', index=False)
print("课堂-课程关系抽取成功!")
|
|
|