You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

135 lines
6.4 KiB

# Script setup: imports, import-path tweak and global display/warning options.
import pandas as pd
import os
import sys
# Put the current working directory on the import path before importing
# `config` (presumably so the script can be launched from the project root --
# TODO confirm).
sys.path.append(os.getcwd())
# `study_path_data` is used below as the prefix of every input CSV path and
# `kg_data` as the prefix of every output path; both are concatenated with
# plain `+`, so they must already end with a path separator.
from config import study_path_data
from config import kg_data
from tqdm import tqdm
import warnings
# Registers tqdm's progress_* helpers on pandas objects; none of them is used
# in the visible part of this script.
tqdm.pandas()
# Silence every warning for the rest of the run.
warnings.filterwarnings('ignore')
# Derive the entity tables and the relation tables from the raw study tables.
## Entity tables
# Attachment entities: load every column of the raw table, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
attachments = pd.read_csv(
    study_path_data + "attachments.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
attachments.to_csv(kg_data + 'entity/attachments.csv', sep='\t', index=None)
print("附件实体提取成功!")
# Challenge entities: keep only the columns listed below, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
challenges = pd.read_csv(
    study_path_data + "challenges.csv",
    sep='\t',
    usecols=[
        'challenge_id',
        'challenge_name',
        'created_at',
        'updated_at',
        'status',
        'position',
        'task_pass',
        'score',
        'visits',
        'challenge_tags_count',
        'challenge_tag',
    ],
    low_memory=False,
).drop_duplicates()
challenges.to_csv(kg_data + 'entity/challenges.csv', sep='\t', index=None)
print("挑战实体提取成功!")
# Course (classroom) entities: load every column, drop exact duplicate rows
# and export the result as a tab-separated file (no index).
courses = pd.read_csv(
    study_path_data + "courses.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
courses.to_csv(kg_data + 'entity/courses.csv', sep='\t', index=None)
print("课堂实体提取成功!")
# Exercise (exam) entities: keep only the columns listed below, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
exercises = pd.read_csv(
    study_path_data + "exercises.csv",
    sep='\t',
    usecols=[
        'exercise_id',
        'exercise_name',
        'exercise_description',
        'exercise_status',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
exercises.to_csv(kg_data + 'entity/exercises.csv', sep='\t', index=None)
print("考试实体提取成功!")
# Homework entities: keep only the columns listed below, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
homeworks = pd.read_csv(
    study_path_data + "homeworks.csv",
    sep='\t',
    usecols=[
        'homework_id',
        'homework_name',
        'description',
        'homework_type',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
homeworks.to_csv(kg_data + 'entity/homeworks.csv', sep='\t', index=None)
print("作业实体提取成功!")
# Shixun (practical training) entities: load every column, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
shixuns = pd.read_csv(
    study_path_data + "shixuns.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
shixuns.to_csv(kg_data + 'entity/shixuns.csv', sep='\t', index=None)
print("实训实体提取成功!")
# Stage (chapter) entities: keep only the columns listed below, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
stages = pd.read_csv(
    study_path_data + "stages.csv",
    sep='\t',
    usecols=[
        'stage_id',
        'stage_name',
        'description',
        'shixuns_count',
        'created_at',
        'updated_at',
    ],
    low_memory=False,
).drop_duplicates()
stages.to_csv(kg_data + 'entity/stages.csv', sep='\t', index=None)
print("章节实体提取成功!")
# Subject (practice course) entities: load every column, drop exact
# duplicate rows and export the result as a tab-separated file (no index).
subjects = pd.read_csv(
    study_path_data + "subjects.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
subjects.to_csv(kg_data + 'entity/subjects.csv', sep='\t', index=None)
print("实践课堂实体提取成功!")
# Teaching-video entities: load every column, drop exact duplicate rows and
# export the result as a tab-separated file (no index).
# Note the input file is "video_item.csv" (singular) while the output is
# "video_items.csv" (plural).
video_items = pd.read_csv(
    study_path_data + "video_item.csv",
    sep='\t',
    low_memory=False,
).drop_duplicates()
video_items.to_csv(kg_data + 'entity/video_items.csv', sep='\t', index=None)
print("教学视频实体提取成功!")
## Relation tables
# Shixun -> challenge relation: take the two id columns from the raw
# challenges table, append a constant `relation` label column, drop exact
# duplicate rows and export as a tab-separated file (no index).
shixun_challenge = (
    pd.read_csv(
        study_path_data + "challenges.csv",
        sep='\t',
        usecols=['shixun_id', 'challenge_id'],
        low_memory=False,
    )
    .assign(relation='实训关卡')
    .drop_duplicates()
)
shixun_challenge.to_csv(kg_data + 'relation/shixun_challenge.csv', sep='\t', index=None)
print("挑战-实训关系抽取成功!")
# Course -> exercise relation: take the two id columns from the raw
# exercises table, append a constant `relation` label column, drop exact
# duplicate rows and export as a tab-separated file (no index).
course_exercise = (
    pd.read_csv(
        study_path_data + "exercises.csv",
        sep='\t',
        usecols=['course_id', 'exercise_id'],
        low_memory=False,
    )
    .assign(relation='课堂考试')
    .drop_duplicates()
)
course_exercise.to_csv(kg_data + 'relation/course_exercise.csv', sep='\t', index=None)
print("课堂-考试关系抽取成功!")
# Shixun -> homework relation: take the two id columns from the raw
# homework_shixun table, append a constant `relation` label column, drop
# exact duplicate rows and export as a tab-separated file (no index).
shixun_homework = (
    pd.read_csv(
        study_path_data + "homework_shixun.csv",
        sep='\t',
        usecols=['shixun_id', 'homework_id'],
        low_memory=False,
    )
    .assign(relation='实训作业')
    .drop_duplicates()
)
shixun_homework.to_csv(kg_data + 'relation/shixun_homework.csv', sep='\t', index=None)
print("实训-作业关系抽取成功!")
# Subject -> stage relation: take the two id columns from the raw
# stage_items table, append a constant `relation` label column, drop exact
# duplicate rows and export as a tab-separated file (no index).
subject_stage = (
    pd.read_csv(
        study_path_data + "stage_items.csv",
        sep='\t',
        usecols=['subject_id', 'stage_id'],
        low_memory=False,
    )
    .assign(relation='课程章节')
    .drop_duplicates()
)
subject_stage.to_csv(kg_data + 'relation/subject_stage.csv', sep='\t', index=None)
print("课程-章节关系抽取成功!")
# Stage-item relations: stage -> shixun / attachment / video item.
# All three are slices of stage_items.csv selected by `item_type`, so the
# file is parsed once here instead of once per relation.
def _export_stage_item_relation(stage_items, item_type, id_column, out_name):
    """Export the stage_items rows of one item type as a relation table.

    Args:
        stage_items: DataFrame loaded from stage_items.csv. It must have an
            ``item_type`` column; its ``item_id`` column is renamed.
        item_type: Value of ``item_type`` to keep (e.g. ``'Shixun'``).
        id_column: New name given to the ``item_id`` column in the output.
        out_name: File name written under ``kg_data + 'relation/'``.

    Returns:
        The filtered, de-duplicated and renamed DataFrame that was written.
    """
    # NOTE(review): the original script had the projection to
    # (stage_id, item_id) and the constant `relation` label commented out for
    # all three tables, so every stage_items column is still exported and no
    # label column is added. Confirm with the consumers of these files before
    # narrowing the schema.
    relation = stage_items[stage_items['item_type'] == item_type].drop_duplicates()
    relation = relation.rename(columns={'item_id': id_column})
    # index=False is the documented way to omit the index column.
    relation.to_csv(kg_data + 'relation/' + out_name, sep='\t', index=False)
    return relation


_stage_items = pd.read_csv(study_path_data + "stage_items.csv", sep='\t', low_memory=False)

# Stage -> shixun relation.
stage_shixun = _export_stage_item_relation(
    _stage_items, 'Shixun', 'shixun_id', 'stage_shixun.csv'
)
print("章节-实训关系抽取成功!")

# Stage -> attachment (courseware) relation.
stage_attachment = _export_stage_item_relation(
    _stage_items, 'Attachment', 'attachment_id', 'stage_attachment.csv'
)
print("章节-课件关系抽取成功!")

# Stage -> video item relation.
stage_video_item = _export_stage_item_relation(
    _stage_items, 'VideoItem', 'video_item_id', 'stage_video_item.csv'
)
print("章节-视频关系抽取成功!")

# The full table is no longer needed; release it so it does not stay resident.
del _stage_items
# Course -> subject relation: take the two id columns from the raw
# subject_course_records table, append a constant `relation` label column,
# drop exact duplicate rows and export as a tab-separated file (no index).
course_subject = (
    pd.read_csv(
        study_path_data + "subject_course_records.csv",
        sep='\t',
        usecols=['subject_id', 'course_id'],
        low_memory=False,
    )
    .assign(relation='课堂课程')
    .drop_duplicates()
)
course_subject.to_csv(kg_data + 'relation/course_subject.csv', sep='\t', index=None)
print("课堂-课程关系抽取成功!")