You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

71 lines
2.2 KiB

# 根据词性提取实践课程数据中的关键词
from ltp import LTP
from jieba import posseg
import pandas as pd
from tqdm import tqdm
import os
import sys
sys.path.append(os.getcwd())
from config import ltp_model_path, subjects_data_path
from config import user_dict_path, subjects_keywords_path
# Register tqdm's pandas integration so that Series.progress_apply
# (used in build_keyword) is available.
tqdm.pandas()
# Load the LTP model from the configured model path.
ltp = LTP(ltp_model_path)
# Read the complete practical-training (subject) dataset; the file is
# parsed as tab-separated, UTF-8 encoded text.
data = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')
def ltp_seg_pos(text):
    """Segment sentences with LTP and keep the words tagged as keywords.

    Each sentence is run through the module-level ``ltp`` pipeline with the
    word-segmentation (``cws``) and part-of-speech (``pos``) tasks.  A word
    is kept when its POS tag is one of the keyword tags listed below.

    Args:
        text: Iterable of sentences to process (for example a pandas Series
            of subject names).

    Returns:
        List of unique keywords in first-seen order, so the result is
        reproducible between runs.
    """
    keyword_pos_tags = frozenset(
        ('b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'))
    # A dict is used as an insertion-ordered set: de-duplicating through a
    # plain set would make the output order depend on string-hash
    # randomization and therefore change from run to run.
    unique_keywords = {}
    # Materialize the input so tqdm knows the total even for generators.
    for sent in tqdm(list(text)):
        # The pipeline takes a batch; wrap the single sentence in a list and
        # read back the first (only) result of each task.
        output = ltp.pipeline([sent], tasks=['cws', 'pos'])
        seg_list = output['cws'][0]
        pos_list = output['pos'][0]
        for seg, pos in zip(seg_list, pos_list):
            if pos in keyword_pos_tags:
                unique_keywords[seg] = None
    return list(unique_keywords)
def build_keyword(sku_path, to_file, token_method='ltp'):
    """Extract keywords from the subject names via part-of-speech tagging.

    Reads the ``subject_name`` column of the module-level ``data`` frame,
    drops missing values, tokenizes every name with the selected backend and
    keeps the tokens whose POS tag is in the keyword tag list.

    Args:
        sku_path: Not used by this function; kept so existing callers keep
            working.  NOTE(review): the script entry point passes
            ``user_dict_path`` here, so this was presumably meant to be
            loaded as a jieba user dictionary -- confirm.
        to_file: Path of the output file (one keyword per line, UTF-8), or
            ``None`` to skip writing.  Missing parent directories are
            created.
        token_method: ``'jieba'`` (keeps only tokens longer than one
            character) or ``'ltp'`` (delegates to ``ltp_seg_pos``).  Any
            other value produces an empty keyword list.

    Returns:
        List of unique keywords.  For ``'jieba'`` the order is first-seen;
        for ``'ltp'`` it is whatever order ``ltp_seg_pos`` returns.
    """
    keyword_pos_tags = ('b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz')
    print('Start build keywords')
    key_words = []
    if token_method == 'jieba':
        subject_names = data['subject_name'].dropna()
        tokens_per_subject = subject_names.progress_apply(
            lambda name: [token for token, pos in posseg.cut(name)
                          if pos in keyword_pos_tags])
        # Iterate the Series values directly (Series.iteritems was removed
        # in pandas 2.0) and de-duplicate while preserving first-seen order.
        key_words = list(dict.fromkeys(
            token
            for tokens in tokens_per_subject
            for token in tokens
            if len(token) > 1))
    elif token_method == 'ltp':
        key_words = list(ltp_seg_pos(data['subject_name'].dropna()))
    print("Building keywords finished")

    # Only touch the filesystem when an output path was actually given;
    # os.path.dirname(None) would raise TypeError.
    if to_file is not None:
        out_dir = os.path.dirname(to_file)
        # A bare filename has an empty dirname: nothing to create then.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(to_file, 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for word in key_words)
    return key_words
if __name__ == '__main__':
    # Script entry point: build the keyword list with the jieba tokenizer
    # and write it to the configured keywords file.
    build_keyword(
        sku_path=user_dict_path,
        to_file=subjects_keywords_path,
        token_method='jieba',
    )