You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

63 lines
2.9 KiB

from pprint import pprint
from paddlenlp import Taskflow
import pandas as pd
from tqdm import tqdm
import os
import sys
sys.path.append(os.getcwd())
from config import kg_data,study_path_data
# Tags whose tokens are kept when building the knowledge string.
# NOTE(review): presumably the tag set emitted by PaddleNLP's fast NER mode —
# confirm against the Taskflow documentation.
_KNOWLEDGE_LABELS = frozenset(
    ["n", 'nr', 'nz', 's', 'ns', 'an', 'vn', 'c', 'f', 'u', 'v']
)

# Tokens that are dropped even when their tag is accepted (section words,
# numbering and file-extension fragments that show up in item names).
_IGNORED_WORDS = frozenset([
    '部分', "专题", "阶段", "Chap", "", 'I', 'II', "III", "pdf", 'ppt', 'word',
    'xlsx', 'txt', 'docx', 'DOC', 'zip', 'doc', 'xls', 'ch', 'v2', 'jpg', 'v4',
    'PNG', 'png', 'pptx',
])

# Lazily created, shared NER pipeline (see _get_ner_fast).
_ner_fast = None


def _get_ner_fast():
    """Return the shared fast-mode NER Taskflow, creating it on first use."""
    global _ner_fast
    if _ner_fast is None:
        # Building the Taskflow once avoids re-initialising it for every name.
        _ner_fast = Taskflow("ner", mode="fast")  # fast mode
    return _ner_fast


def get_knowledge(name):
    """Extract the knowledge keywords contained in an item name.

    The name is tagged with the fast-mode NER Taskflow; every token whose tag
    is in ``_KNOWLEDGE_LABELS`` and whose text is not in ``_IGNORED_WORDS`` is
    kept, and the kept tokens are concatenated in their original order.

    Args:
        name: Item name (shixun name, attachment file name, video name).

    Returns:
        The concatenated kept tokens, or ``""`` when nothing is kept or when
        ``name`` is empty or not a string (e.g. ``None`` / NaN coming from a
        CSV cell with a missing value).
    """
    # Missing CSV cells arrive as NaN/None; there is nothing to tag in them.
    if not isinstance(name, str) or not name:
        return ""

    tagged = _get_ner_fast()(name)
    # Each result item is indexed as (token text, tag).
    return "".join(
        item[0]
        for item in tagged
        if item[1] in _KNOWLEDGE_LABELS and item[0] not in _IGNORED_WORDS
    )
def extract_stage_knowledge(stage_item, items, item_type, id_column,
                            name_column, extract, progress=None):
    """Join one kind of stage item with its metadata and add a knowledge column.

    Args:
        stage_item: DataFrame with at least ``stage_id``, ``item_id`` and
            ``item_type`` columns.
        items: DataFrame describing the items of one kind; it must contain
            ``id_column`` and ``name_column``. It is not modified.
        item_type: Value of ``stage_item["item_type"]`` selecting the rows.
        id_column: Column of ``items`` that matches ``stage_item["item_id"]``.
        name_column: Column of ``items`` holding the item name.
        extract: Callable mapping an item name to its knowledge string.
        progress: Optional callable wrapping the iterable of names (for
            example ``tqdm``) to report progress.

    Returns:
        A new DataFrame with the columns ``stage_id``, ``item_id``,
        ``item_type``, ``item_name`` and ``knowledge``.
    """
    renamed = items.rename(columns={id_column: 'item_id'})
    selected = stage_item[stage_item["item_type"] == item_type]
    merged = pd.merge(selected, renamed, on='item_id')
    # Explicit copy: the column added below must land on this frame itself,
    # not on a slice of the merge result.
    merged = merged[['stage_id', 'item_id', 'item_type', name_column]].copy()

    names = merged[name_column]
    if progress is not None:
        names = progress(names)
    # Assign the whole column at once; element-wise chained assignment
    # (df[col][i] = ...) is unreliable and a no-op under Copy-on-Write.
    merged['knowledge'] = [extract(item_name) for item_name in names]
    return merged.rename(columns={name_column: 'item_name'})


if __name__ == '__main__':
    stage_item = pd.read_csv(study_path_data+'stage_items.csv', sep='\t')
    shixuns = pd.read_csv(study_path_data+'shixuns.csv', sep='\t')
    attachments = pd.read_csv(study_path_data+'attachments.csv', sep='\t')
    video_items = pd.read_csv(study_path_data+'video_item.csv', sep='\t')

    # (items frame, item_type value, id column, name column, output file)
    jobs = (
        # Extract knowledge points from shixuns (practical training items).
        (shixuns, 'Shixun', 'shixun_id', 'shixun_name', 'stage_shixuns.csv'),
        # Extract knowledge points from courseware attachments.
        (attachments, 'Attachment', 'attachment_id', 'filename',
         'stage_attachments.csv'),
        # Extract knowledge points from videos.
        (video_items, 'VideoItem', 'video_item_id', 'video_name',
         'stage_videos.csv'),
    )
    for items, item_type, id_column, name_column, out_name in jobs:
        result = extract_stage_knowledge(
            stage_item, items, item_type, id_column, name_column,
            extract=get_knowledge, progress=tqdm,
        )
        result.to_csv(kg_data+out_name, sep='\t', index=False)