You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

76 lines
2.2 KiB

from ltp import LTP
from jieba import posseg
import pandas as pd
from tqdm import tqdm
import torch
import os
import sys
sys.path.append(os.getcwd())
from config import ltp_model_path, shixuns_data_path
from config import user_dict_path, shixuns_keywords_path
# Extract keywords from the training-project (shixun) data by part-of-speech tag.

# Enable pandas' `progress_apply`, which build_keyword uses to show progress.
tqdm.pandas()

# Load the LTP model and move it to the GPU when CUDA is available.
ltp = LTP(ltp_model_path)
if torch.cuda.is_available():
    ltp.to("cuda")

# Read the whole training-project dataset (tab-separated, UTF-8).
data = pd.read_csv(shixuns_data_path, encoding='utf-8', sep='\t')
def ltp_seg_pos(text):
    """Segment sentences with LTP and keep words whose POS tag is wanted.

    Args:
        text: Iterable of sentences. It is materialised into a list so that
            tqdm can show a total.

    Returns:
        A list of the distinct words tagged with one of ``b, n, nr, ns, nh,
        nt, vn, nz``, in order of first appearance.
    """
    wanted_pos = frozenset(('b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'))
    # A dict serves as an insertion-ordered set. De-duplicating through
    # ``set`` gave an output order that could differ from run to run.
    seen = {}
    for sent in tqdm(list(text)):
        # The pipeline receives a one-sentence batch, so index 0 of each
        # task's output is the result for this sentence.
        output = ltp.pipeline([sent], tasks=['cws', 'pos'])
        for seg, pos in zip(output['cws'][0], output['pos'][0]):
            if pos in wanted_pos:
                seen[seg] = None
    return list(seen)
def build_keyword(sku_path, to_file, token_method='ltp'):
    """Extract keywords from ``data['shixun_name']`` by part-of-speech tag.

    Args:
        sku_path: Unused by this function; kept so existing callers keep
            working.
        to_file: Path of the output file (one keyword per line, UTF-8), or
            ``None`` to skip writing. Missing parent directories are created.
        token_method: ``'jieba'`` tags with ``jieba.posseg`` and keeps only
            tokens longer than one character; ``'ltp'`` delegates to
            ``ltp_seg_pos``. Any other value yields no keywords.

    Returns:
        The list of distinct keywords, in order of first appearance.
    """
    print('Start build keywords')
    key_words = []
    names = data['shixun_name'].dropna()
    if token_method == 'jieba':
        wanted_pos = frozenset(('b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'))
        tokens = names.progress_apply(
            lambda x: [token for token, pos in posseg.cut(x)
                       if pos in wanted_pos])
        # Iterate the Series values directly: ``Series.iteritems`` no longer
        # exists in pandas 2.x. ``dict.fromkeys`` de-duplicates while keeping
        # a stable, first-appearance order.
        key_words = list(dict.fromkeys(
            tk for sample in tokens for tk in sample if len(tk) > 1))
    elif token_method == 'ltp':
        key_words = list(ltp_seg_pos(names))
    print("Building keywords finished")
    # Only touch the filesystem when an output path was actually given.
    if to_file is not None:
        out_dir = os.path.dirname(to_file)
        # A bare filename has an empty dirname; there is nothing to create.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(to_file, 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for word in key_words)
    return key_words
# Script entry point: build the keyword file using the jieba tokenizer.
if __name__ == '__main__':
    build_keyword(
        sku_path=user_dict_path,
        to_file=shixuns_keywords_path,
        token_method='jieba',
    )