You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
2.9 KiB

5 months ago
from pprint import pprint
from paddlenlp import Taskflow
import pandas as pd
from tqdm import tqdm
import os
import sys
sys.path.append(os.getcwd())
from config import kg_data,study_path_data
# Tags whose tokens are kept when assembling the knowledge string.
_KEPT_TAGS = frozenset(
    {"n", "nr", "nz", "s", "ns", "an", "vn", "c", "f", "u", "v"}
)
# Tokens that are dropped even when their tag is kept (section words, file
# extensions, version markers, ...). The empty string is listed on purpose:
# it was present in the original stop list.
_IGNORED_WORDS = frozenset(
    {
        '部分', "专题", "阶段", "Chap", "", 'I', 'II', "III", "pdf", 'ppt',
        'word', 'xlsx', 'txt', 'docx', 'DOC', 'zip', 'doc', 'xls', 'ch', 'v2',
        'jpg', 'v4', 'PNG', 'png', 'pptx',
    }
)
# Default tagger, created lazily on first use and then shared by every call.
_DEFAULT_TAGGER = None


def _default_tagger():
    """Return the shared fast-mode NER Taskflow, building it on first use."""
    global _DEFAULT_TAGGER
    if _DEFAULT_TAGGER is None:
        # Constructing a Taskflow per call is wasteful; build it only once.
        _DEFAULT_TAGGER = Taskflow("ner", mode="fast")  # fast mode
    return _DEFAULT_TAGGER


def get_knowledge(name, tagger=None):
    """Extract the knowledge keywords contained in an item name.

    The name is split into tagged tokens by ``tagger``; tokens whose tag is in
    ``_KEPT_TAGS`` and whose text is not in ``_IGNORED_WORDS`` are concatenated
    in their original order, without any separator.

    Args:
        name: Item title or file name to analyse.
        tagger: Optional callable taking the name and returning an iterable of
            items where ``item[0]`` is the token text and ``item[1]`` its tag.
            Defaults to a shared ``Taskflow("ner", mode="fast")`` instance.

    Returns:
        The concatenated kept tokens. An empty string is returned when
        ``name`` is not a non-empty string (e.g. ``None`` or the float NaN
        that pandas uses for a missing cell).
    """
    # Missing cells must not reach the tagger, which expects text.
    if not isinstance(name, str) or not name:
        return ""
    if tagger is None:
        tagger = _default_tagger()
    return "".join(
        token[0]
        for token in tagger(name)
        if token[1] in _KEPT_TAGS and token[0] not in _IGNORED_WORDS
    )
def _extract_stage_knowledge(stage_item, items, item_type, name_column, output_path):
    """Annotate one kind of stage item with knowledge keywords and save it.

    Args:
        stage_item: Frame of stage items; must provide the ``stage_id``,
            ``item_id`` and ``item_type`` columns.
        items: Detail frame for this kind of item; must provide ``item_id``
            and ``name_column``.
        item_type: Value of ``stage_item['item_type']`` selecting the rows.
        name_column: Column of ``items`` holding the text to analyse; it is
            written out under the name ``item_name``.
        output_path: Destination of the tab-separated result file.
    """
    selected = stage_item[stage_item["item_type"] == item_type]
    merged = pd.merge(selected, items, on='item_id')
    merged = merged[['stage_id', 'item_id', 'item_type', name_column]].copy()
    # Assign the whole column at once: element-wise chained assignment
    # (frame[col][i] = value) does not write back under pandas Copy-on-Write.
    merged['knowledge'] = [get_knowledge(name) for name in tqdm(merged[name_column])]
    merged = merged.rename(columns={name_column: 'item_name'})
    merged.to_csv(output_path, sep='\t', index=False)


if __name__ == '__main__':
    # Load every input first so that a missing file fails before any output
    # is written. Each detail table gets its id column renamed to the shared
    # 'item_id' join key.
    stage_item = pd.read_csv(study_path_data+'stage_items.csv', sep='\t')
    shixuns = pd.read_csv(study_path_data+'shixuns.csv', sep='\t').rename(
        columns={'shixun_id': 'item_id'})
    attachments = pd.read_csv(study_path_data+'attachments.csv', sep='\t').rename(
        columns={'attachment_id': 'item_id'})
    video_items = pd.read_csv(study_path_data+'video_item.csv', sep='\t').rename(
        columns={'video_item_id': 'item_id'})

    # Extract knowledge points of the practical-training (shixun) items.
    _extract_stage_knowledge(
        stage_item, shixuns, 'Shixun', 'shixun_name',
        kg_data+'stage_shixuns.csv')
    # Extract knowledge points of the courseware attachments.
    _extract_stage_knowledge(
        stage_item, attachments, 'Attachment', 'filename',
        kg_data+'stage_attachments.csv')
    # Extract knowledge points of the videos.
    _extract_stage_knowledge(
        stage_item, video_items, 'VideoItem', 'video_name',
        kg_data+'stage_videos.csv')