You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
2.9 KiB
63 lines
2.9 KiB
5 months ago
|
from pprint import pprint
|
||
|
from paddlenlp import Taskflow
|
||
|
import pandas as pd
|
||
|
from tqdm import tqdm
|
||
|
import os
|
||
|
import sys
|
||
|
sys.path.append(os.getcwd())
|
||
|
from config import kg_data,study_path_data
|
||
|
|
||
|
# Tags whose tokens are kept when building the keyword string.
# NOTE(review): presumably part-of-speech / entity tags emitted by the
# tagger in fast mode -- confirm against the PaddleNLP tag set.
_KNOWLEDGE_LABELS = frozenset(
    ["n", "nr", "nz", "s", "ns", "an", "vn", "c", "f", "u", "v"]
)

# Tokens that are dropped even when their tag is in _KNOWLEDGE_LABELS
# (section markers, numbering and file-type fragments).
_KNOWLEDGE_STOP_WORDS = frozenset([
    "部分", "专题", "阶段", "Chap", "上", "下", "I", "II", "III",
    "pdf", "ppt", "word", "xlsx", "txt", "docx", "DOC", "zip", "doc",
    "xls", "ch", "v2", "jpg", "v4", "PNG", "png", "pptx",
])


def get_knowledge(name, ner=None):
    """Return the knowledge keywords found in *name* as one concatenated string.

    The name is split into ``(token, tag)`` pairs by a tagger; tokens whose
    tag is in ``_KNOWLEDGE_LABELS`` and whose text is not in
    ``_KNOWLEDGE_STOP_WORDS`` are joined together in their original order.

    Args:
        name: Item name (e.g. a title or file name) to analyse.
        ner: Optional callable taking the name and returning an iterable of
            ``(token, tag)`` pairs. When omitted, a
            ``Taskflow("ner", mode="fast")`` instance is created on first
            use and reused by every later call.

    Returns:
        The kept tokens concatenated without a separator, or ``""`` when
        *name* is empty or not a string (e.g. a NaN from a missing cell).
    """
    # Missing values read from CSV arrive as float NaN; there is nothing to tag.
    if not isinstance(name, str) or not name:
        return ""

    if ner is None:
        # Constructing the tagger is expensive and this function is called
        # once per row, so build it lazily and keep it on the function object.
        ner = getattr(get_knowledge, "_ner_fast", None)
        if ner is None:
            ner = Taskflow("ner", mode="fast")  # fast mode
            get_knowledge._ner_fast = ner

    return "".join(
        token[0]
        for token in ner(name)
        if token[1] in _KNOWLEDGE_LABELS
        and token[0] not in _KNOWLEDGE_STOP_WORDS
    )
|
||
|
|
||
|
def _extract_stage_knowledge(stage_item, items, item_type, name_column, out_file):
    """Extract knowledge keywords for one kind of stage item and save them.

    Filters ``stage_item`` to rows whose ``item_type`` equals *item_type*,
    joins them with *items* on ``item_id``, runs ``get_knowledge`` on every
    value of *name_column*, renames that column to ``item_name`` and writes
    the result as a tab-separated file named *out_file* under ``kg_data``.

    Args:
        stage_item: DataFrame with at least ``stage_id``, ``item_id`` and
            ``item_type`` columns.
        items: DataFrame with ``item_id`` and *name_column* columns.
        item_type: Value of ``item_type`` to select.
        name_column: Column in *items* holding the item name.
        out_file: File name appended to ``kg_data`` for the output CSV.

    Returns:
        The DataFrame that was written.
    """
    selected = stage_item[stage_item["item_type"] == item_type]
    merged = pd.merge(selected, items, on='item_id')
    # Explicit copy so the new column is written to this frame, not to a view.
    merged = merged[['stage_id', 'item_id', 'item_type', name_column]].copy()
    # Assign the whole column at once: element-wise chained assignment
    # (df['knowledge'][i] = ...) does not write through under copy-on-write.
    merged['knowledge'] = [
        get_knowledge(item_name)
        for item_name in tqdm(merged[name_column], total=len(merged))
    ]
    merged.rename(columns={name_column: 'item_name'}, inplace=True)
    merged.to_csv(kg_data + out_file, sep='\t', index=False)
    return merged


if __name__ == '__main__':

    stage_item = pd.read_csv(study_path_data+'stage_items.csv',sep='\t')
    shixuns = pd.read_csv(study_path_data+'shixuns.csv',sep='\t')
    attachments = pd.read_csv(study_path_data+'attachments.csv',sep='\t')
    video_items = pd.read_csv(study_path_data+'video_item.csv',sep='\t')

    # Give every item table the same key name so it can be joined with stage_item.
    shixuns.rename(columns={'shixun_id':'item_id'},inplace=True)
    attachments.rename(columns={'attachment_id':'item_id'},inplace=True)
    video_items.rename(columns={'video_item_id':'item_id'},inplace=True)

    # Extract knowledge points for practical-training (Shixun) items
    stage_shixuns = _extract_stage_knowledge(
        stage_item, shixuns, 'Shixun', 'shixun_name', 'stage_shixuns.csv')

    # Extract knowledge points for courseware (attachment) items
    stage_attachments = _extract_stage_knowledge(
        stage_item, attachments, 'Attachment', 'filename', 'stage_attachments.csv')

    # Extract knowledge points for video items
    stage_videos = _extract_stage_knowledge(
        stage_item, video_items, 'VideoItem', 'video_name', 'stage_videos.csv')
|