# Keyword-extraction script for shixun (training exercise) data.
from ltp import LTP
|
|
from jieba import posseg
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
import torch
|
|
import os
|
|
import sys
|
|
sys.path.append(os.getcwd())
|
|
from config import ltp_model_path, shixuns_data_path
|
|
from config import user_dict_path, shixuns_keywords_path
|
|
|
|
# Extract keywords from the shixun (training exercise) data based on part of speech.

# Enable `Series.progress_apply` so pandas operations show a tqdm progress bar.
tqdm.pandas()

# Load the LTP (Language Technology Platform) word-segmentation / POS model.
# NOTE(review): `ltp_model_path` comes from the project-level config module.
ltp = LTP(ltp_model_path)

# Move the model to GPU when one is available.
if torch.cuda.is_available():
    ltp.to("cuda")

# Read all shixun records; the file is expected to be tab-separated UTF-8.
data = pd.read_csv(shixuns_data_path, sep='\t', encoding='utf-8')
|
def ltp_seg_pos(text):
    """Segment each sentence with LTP and keep words whose POS tag marks a keyword.

    Args:
        text: iterable of sentences (e.g. a pandas Series of shixun names).

    Returns:
        list[str]: de-duplicated keywords collected over all sentences.
        Order is unspecified because the words come out of a set.
    """
    # POS tags treated as keyword-bearing (LTP tag set: noun-like tags and
    # verbal nouns). Hoisted out of the loop as a frozenset so the membership
    # test is O(1) and loop-invariant.
    keyword_pos = frozenset(['b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz'])

    # Collect directly into a set instead of a list plus a final set() pass.
    keywords = set()
    for sent in tqdm(list(text)):
        # One-sentence batch per call, matching the original per-sentence loop
        # (batching everything at once could change progress reporting).
        output = ltp.pipeline([sent], tasks=['cws', 'pos'])
        seg_list = output['cws'][0]
        pos_list = output['pos'][0]
        keywords.update(seg for seg, pos in zip(seg_list, pos_list)
                        if pos in keyword_pos)

    return list(keywords)
|
|
|
|
def build_keyword(sku_path, to_file, token_method='ltp'):
    """Extract keywords from the shixun names via POS tagging and save them.

    Args:
        sku_path: unused; kept for backward compatibility with existing callers.
        to_file: output path, one keyword per line; pass None to skip writing.
        token_method: 'jieba' or 'ltp' — which tokenizer/POS tagger to use.
            Any other value yields an empty keyword list.

    Returns:
        list[str]: the extracted keywords.
    """
    print('Start build keywords')
    key_words = []

    if token_method == 'jieba':
        # POS tags treated as keyword-bearing (noun-like tags and verbal nouns).
        keyword_pos = ['b', 'n', 'nr', 'ns', 'nh', 'nt', 'vn', 'nz']
        tokens = data['shixun_name'].dropna().progress_apply(
            lambda x: [token for token, pos in posseg.cut(x)
                       if pos in keyword_pos])

        # De-duplicate and drop single-character tokens.
        # BUGFIX: Series.iteritems() was removed in pandas 2.0 — use items().
        key_words = list({tk for _, sample in tokens.items()
                          for tk in sample if len(tk) > 1})
    elif token_method == 'ltp':
        key_words = list(ltp_seg_pos(data['shixun_name'].dropna()))

    print("Building keywords finished")

    if to_file is not None:
        # BUGFIX: the directory check previously ran before the None guard
        # (os.path.dirname(None) raises TypeError), and os.mkdir cannot create
        # nested directories. makedirs(..., exist_ok=True) fixes both; the
        # truthiness guard skips makedirs('') when to_file is a bare filename.
        out_dir = os.path.dirname(to_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(to_file, 'w', encoding='utf-8') as f:
            for word in key_words:
                f.write(word + '\n')
    return key_words
|
|
|
|
if __name__ == '__main__':
    # Build keywords with the jieba tokenizer and write them to the keywords file.
    # NOTE(review): user_dict_path is passed as the (unused) sku_path positional
    # argument — confirm this is intentional.
    build_keyword(user_dict_path, to_file=shixuns_keywords_path, token_method='jieba')