# Keyword-extraction script for shixun (training exercise) data.
from ltp import LTP
|
|
from jieba import posseg
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
import torch
|
|
import os
|
|
import sys
|
|
sys.path.append(os.getcwd())
|
|
from config import ltp_model_path, shixuns_data_path
|
|
from config import user_dict_path, shixuns_keywords_path
|
|
|
|
# Extract keywords from the shixun (training exercise) data based on part of speech.

# Enable `Series.progress_apply` so pandas operations show a tqdm progress bar.
tqdm.pandas()

# Load the LTP (Language Technology Platform) word-segmentation / POS model.
# NOTE(review): `ltp_model_path` comes from the project-level config module.
ltp = LTP(ltp_model_path)

# Move the model to GPU when one is available.
if torch.cuda.is_available():
    ltp.to("cuda")

# Read all shixun records; the file is expected to be tab-separated UTF-8.
data = pd.read_csv(shixuns_data_path, sep='\t', encoding='utf-8')
|
def ltp_seg_pos(text):
    """Segment each sentence with LTP and keep words whose POS tag marks a keyword.

    Args:
        text: iterable of sentences (e.g. a pandas Series of shixun names).

    Returns:
        list[str]: de-duplicated keywords collected over all sentences.
        Order is unspecified because the words come out of a set.
    """
    # POS tags treated as keyword-bearing (LTP tag set: noun-like tags and
    # verbal nouns). Hoisted out of the loop as a frozenset so the membership
    # test is O(1) and loop-invariant.
    keyword_pos = frozenset(['b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'])

    # Collect directly into a set instead of a list plus a final set() pass.
    keywords = set()
    for sent in tqdm(list(text)):
        # One-sentence batch per call, matching the original per-sentence loop
        # (batching everything at once could change progress reporting).
        output = ltp.pipeline([sent], tasks=['cws', 'pos'])
        seg_list = output['cws'][0]
        pos_list = output['pos'][0]
        keywords.update(seg for seg, pos in zip(seg_list, pos_list)
                        if pos in keyword_pos)

    return list(keywords)
|
|
|
|
def build_keyword(sku_path, to_file, token_method='ltp'):
    """Extract keywords from the shixun names via POS tagging and save them.

    Args:
        sku_path: unused; kept for backward compatibility with existing callers.
        to_file: output path, one keyword per line; pass None to skip writing.
        token_method: 'jieba' or 'ltp' — which tokenizer/POS tagger to use.
            Any other value yields an empty keyword list.

    Returns:
        list[str]: the extracted keywords.
    """
    print('Start build keywords')
    key_words = []

    if token_method == 'jieba':
        # POS tags treated as keyword-bearing (noun-like tags and verbal nouns).
        keyword_pos = ['b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz']
        tokens = data['shixun_name'].dropna().progress_apply(
            lambda x: [token for token, pos in posseg.cut(x)
                       if pos in keyword_pos])

        # De-duplicate and drop single-character tokens.
        # BUGFIX: Series.iteritems() was removed in pandas 2.0 — use items().
        key_words = list({tk for _, sample in tokens.items()
                          for tk in sample if len(tk) > 1})
    elif token_method == 'ltp':
        key_words = list(ltp_seg_pos(data['shixun_name'].dropna()))

    print("Building keywords finished")

    if to_file is not None:
        # BUGFIX: the directory check previously ran before the None guard
        # (os.path.dirname(None) raises TypeError), and os.mkdir cannot create
        # nested directories. makedirs(..., exist_ok=True) fixes both; the
        # truthiness guard skips makedirs('') when to_file is a bare filename.
        out_dir = os.path.dirname(to_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(to_file, 'w', encoding='utf-8') as f:
            for word in key_words:
                f.write(word + '\n')
    return key_words
|
|
|
|
if __name__ == '__main__':
    # Build keywords with the jieba tokenizer and write them to the keywords file.
    # NOTE(review): user_dict_path is passed as the (unused) sku_path positional
    # argument — confirm this is intentional.
    build_keyword(user_dict_path, to_file=shixuns_keywords_path, token_method='jieba')