"""Extract knowledge keywords from stage item names and export them as TSV.

For every stage item (practical training, courseware attachment, video) the
item's display name is run through the PaddleNLP fast-mode NER tagger; the
tokens whose tag is in ``_KEEP_TAGS`` and whose text is not in ``_STOP_WORDS``
are concatenated into a single "knowledge" string.
"""
from functools import lru_cache
from pprint import pprint
from paddlenlp import Taskflow
import pandas as pd
from tqdm import tqdm
import os
import sys

# Make the working directory importable so the project-level ``config``
# module resolves when this file is run as a script.
sys.path.append(os.getcwd())
from config import kg_data, study_path_data

# Tags (as returned by the fast-mode tagger) whose tokens are kept.
_KEEP_TAGS = frozenset(
    ["n", 'nr', 'nz', 's', 'ns', 'an', 'vn', 'c', 'f', 'u', 'v']
)

# Tokens dropped even when their tag is kept: structural words from titles
# ("part", "topic", "stage", ...) and file-extension / version fragments.
_STOP_WORDS = frozenset([
    '部分', "专题", "阶段", "Chap", "上", "下", 'I', 'II', "III",
    "pdf", 'ppt', 'word', 'xlsx', 'txt', 'docx', 'DOC', 'zip', 'doc',
    'xls', 'ch', 'v2', 'jpg', 'v4', 'PNG', 'png', 'pptx',
])


@lru_cache(maxsize=1)
def _ner_tagger():
    """Build the fast-mode NER Taskflow once and reuse it on later calls."""
    return Taskflow("ner", mode="fast")  # fast mode


def get_knowledge(name):
    """Return the knowledge keywords found in *name* as one joined string.

    Args:
        name: Item title or file name to analyse.

    Returns:
        The texts of all tokens whose tag is in ``_KEEP_TAGS`` and whose text
        is not in ``_STOP_WORDS``, concatenated in their original order.
        Returns ``""`` when *name* is empty or not a string (e.g. a NaN cell
        read from a CSV file).
    """
    if not isinstance(name, str) or not name:
        return ""
    # Each token is indexed as (text, tag), matching the tagger output the
    # original code relied on.
    tokens = _ner_tagger()(name)
    return "".join(
        token[0]
        for token in tokens
        if token[1] in _KEEP_TAGS and token[0] not in _STOP_WORDS
    )


def _export_stage_knowledge(stage_item, items, item_type, name_column,
                            out_name):
    """Write the knowledge table for one item type to ``kg_data + out_name``.

    Args:
        stage_item: Stage/item link table with ``stage_id``, ``item_id`` and
            ``item_type`` columns.
        items: Item table whose id column has already been renamed to
            ``item_id`` and which contains *name_column*.
        item_type: Value of ``item_type`` selecting the rows to process.
        name_column: Column of *items* holding the display name.
        out_name: File name of the tab-separated output.
    """
    selected = stage_item[stage_item["item_type"] == item_type]
    merged = pd.merge(selected, items, on='item_id')
    # Work on an explicit copy so the column assignment below always lands
    # in this frame.
    frame = merged[['stage_id', 'item_id', 'item_type', name_column]].copy()
    # Assign the whole column in one step; per-cell chained assignment
    # (``frame['knowledge'][i] = ...``) is unreliable in pandas.
    frame['knowledge'] = [
        get_knowledge(item_name) for item_name in tqdm(frame[name_column])
    ]
    frame.rename(columns={name_column: 'item_name'}, inplace=True)
    frame.to_csv(kg_data + out_name, sep='\t', index=False)


def main():
    """Read the study-path tables and export one knowledge file per type."""
    stage_item = pd.read_csv(study_path_data + 'stage_items.csv', sep='\t')
    shixuns = pd.read_csv(study_path_data + 'shixuns.csv', sep='\t')
    attachments = pd.read_csv(study_path_data + 'attachments.csv', sep='\t')
    video_items = pd.read_csv(study_path_data + 'video_item.csv', sep='\t')

    # Give every item table the same key name so it can be merged with
    # ``stage_item`` on ``item_id``.
    shixuns.rename(columns={'shixun_id': 'item_id'}, inplace=True)
    attachments.rename(columns={'attachment_id': 'item_id'}, inplace=True)
    video_items.rename(columns={'video_item_id': 'item_id'}, inplace=True)

    # Extract knowledge points for practical trainings (shixun).
    _export_stage_knowledge(
        stage_item, shixuns, 'Shixun', 'shixun_name', 'stage_shixuns.csv')
    # Extract knowledge points for courseware attachments.
    _export_stage_knowledge(
        stage_item, attachments, 'Attachment', 'filename',
        'stage_attachments.csv')
    # Extract knowledge points for videos.
    _export_stage_knowledge(
        stage_item, video_items, 'VideoItem', 'video_name',
        'stage_videos.csv')


if __name__ == '__main__':
    main()