from ltp import LTP
from jieba import posseg
import pandas as pd
from tqdm import tqdm
import torch
import os
import sys

# Make the current working directory importable so that `config` resolves
# when the script is launched from the project root.
sys.path.append(os.getcwd())

from config import ltp_model_path, shixuns_data_path
from config import user_dict_path, shixuns_keywords_path

# Extract keywords from the shixun (practical-training) data by part-of-speech.

# Part-of-speech tags whose tokens are kept as keywords. The same tag set is
# applied to both the LTP and the jieba tagger output.
_KEYWORD_POS_TAGS = frozenset(['b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'])

# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Load the LTP model and move it to the GPU when one is available.
ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

# Read all shixun records (tab-separated, UTF-8).
data = pd.read_csv(shixuns_data_path, sep='\t', encoding='utf-8')


def ltp_seg_pos(text):
    """Segment each sentence with LTP and keep tokens with a keyword POS tag.

    Args:
        text: Iterable of sentences (strings). Each sentence is sent to
            ``ltp.pipeline`` on its own with the ``cws`` and ``pos`` tasks.

    Returns:
        list: Unique tokens whose tag is in ``_KEYWORD_POS_TAGS``, in order
        of first appearance.
    """
    keywords = []
    # Materialise the input so tqdm can report a total even for generators.
    for sent in tqdm(list(text)):
        output = ltp.pipeline([sent], tasks=['cws', 'pos'])
        seg_list = output['cws'][0]
        pos_list = output['pos'][0]
        keywords.extend(
            seg for seg, pos in zip(seg_list, pos_list)
            if pos in _KEYWORD_POS_TAGS
        )
    # De-duplicate while keeping a reproducible (first-seen) order; a plain
    # set() would yield a different order from run to run.
    return list(dict.fromkeys(keywords))


def build_keyword(sku_path, to_file, token_method='ltp'):
    """Extract keywords from ``data['shixun_name']`` via POS tagging.

    Args:
        sku_path: Unused by this function; kept so existing callers keep
            working.
        to_file: Path of the output file (one keyword per line, UTF-8), or
            ``None`` to skip writing. Missing parent directories are created.
        token_method: ``'jieba'`` or ``'ltp'``. Any other value produces an
            empty keyword list.

    Returns:
        list: The unique keywords, in order of first appearance.
    """
    print('Start build keywords')
    key_words = []
    if token_method == 'jieba':
        tokens = data['shixun_name'].dropna().progress_apply(
            lambda x: [token for token, pos in posseg.cut(x)
                       if pos in _KEYWORD_POS_TAGS])
        # Iterate the Series values directly: Series.iteritems() no longer
        # exists in pandas >= 2.0. Only the jieba path drops single-character
        # tokens.
        key_words = list(dict.fromkeys(
            tk for sample in tokens for tk in sample if len(tk) > 1))
    elif token_method == 'ltp':
        key_words = list(ltp_seg_pos(data['shixun_name'].dropna()))
    print("Building keywords finished")

    if to_file is not None:
        out_dir = os.path.dirname(to_file)
        # A bare filename has an empty dirname; there is nothing to create.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(to_file, 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for word in key_words)
    return key_words


if __name__ == '__main__':
    build_keyword(user_dict_path, to_file=shixuns_keywords_path,
                  token_method='jieba')