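# NOTE: the following lines are repository web-page text captured with this file; they are not part of the module.
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 117 lines
# 3.4 KiB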

import jieba
from sortedcontainers import SortedSet
import numpy as np
def stopwordslist(filepath):
    """Load a stop-word file and return its lines as a list.

    :param filepath: path of the stop-word file, read as UTF-8 text
    :return: list with one entry per line of the file, in file order, each
        stripped of leading and trailing whitespace (blank lines become '')
    """
    # The context manager closes the file handle even when reading raises,
    # instead of leaving it open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
def seg_sentence(sentence):
    """Segment a sentence with jieba after dropping stop words.

    The stripped sentence is cut with jieba, tokens found in
    ``./stopwords.txt`` and tab tokens are discarded, the remaining tokens are
    joined with a trailing space after each one, and that string is cut with
    jieba a second time.

    :param sentence: the sentence to segment
    :return: list of tokens produced by the second jieba pass; it may include
        whitespace tokens, which callers can remove with filter_uselessword
    """
    # A set makes each membership test O(1) instead of scanning the whole
    # stop-word list for every token.
    stopwords = set(stopwordslist('./stopwords.txt'))  # stop-word file path
    kept_words = [
        word for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    ]
    # Every kept token is followed by exactly one space, as before.
    outstr = ''.join(word + ' ' for word in kept_words)
    return list(jieba.cut(outstr))
def filter_uselessword(seg_list, uselesswords=(' ', '\r\n', '\n')):
    """Remove useless tokens (spaces, line breaks, ...) from a token list.

    :param seg_list: iterable of tokens produced by the first segmentation
    :param uselesswords: tokens to drop; defaults to a single space, '\\r\\n'
        and '\\n'. The default is a tuple so it cannot be mutated between calls.
    :return: a new list holding the tokens of seg_list that are not equal to
        any entry of uselesswords, in their original order
    """
    # Materialise once so a one-shot iterable of useless words can still be
    # checked against every token; a list (not a set) keeps plain equality
    # semantics and does not require the entries to be hashable.
    unwanted = list(uselesswords)
    # Single pass over the tokens instead of one full pass per useless word.
    return [word for word in seg_list if word not in unwanted]
def _encode_sentences(sentences, word_to_index, unknown_index, width):
    """Turn tokenised sentences into a zero-padded matrix of vocabulary indices."""
    encoded = np.zeros((len(sentences), width))
    for row, words in enumerate(sentences):
        for col, word in enumerate(words):
            encoded[row, col] = word_to_index.get(word, unknown_index)
    return encoded


def build_corpus(raw_dataframe):
    """Encode the expected and actual outputs as padded word-index matrices.

    Each cell of the '预期输出' (expected output) and '实际输出' (actual output)
    columns is converted to a string, segmented with seg_sentence and cleaned
    with filter_uselessword. A sorted vocabulary is built from all tokens, with
    index 0 reserved for padding, and every token is replaced by its
    vocabulary index.

    :param raw_dataframe: DataFrame containing the '预期输出' and '实际输出' columns
    :return: tuple (want_answer_corpus, infact_answer_corpus) of two NumPy
        arrays with one row per DataFrame row. Entry [i][j] is the vocabulary
        index of the j-th token of row i, and unused positions stay 0. The row
        width is the vocabulary size (including the padding slot), widened to
        the longest token sequence when that is larger.
    """
    row_count = raw_dataframe.shape[0]
    word_set = SortedSet()
    want_answer_list = []
    infact_answer_list = []
    for i in range(row_count):
        row = raw_dataframe.iloc[i]
        want_words = filter_uselessword(seg_sentence(str(row['预期输出'])))
        infact_words = filter_uselessword(seg_sentence(str(row['实际输出'])))
        for word in want_words:
            word_set.add(word)
        for word in infact_words:
            word_set.add(word)
        want_answer_list.append(list(want_words))
        infact_answer_list.append(list(infact_words))

    # Index 0 is kept free so that it can serve as the padding value.
    vocabulary = [' '] + list(word_set)
    # Dict lookup replaces the per-token list scan; setdefault keeps the first
    # position of a word, matching list.index.
    word_to_index = {}
    for index, word in enumerate(vocabulary):
        word_to_index.setdefault(word, index)
    # Index used for a token missing from the vocabulary.
    unknown_index = len(vocabulary)

    # Tokens are written by position, so a row must be at least as wide as the
    # longest sentence. A sentence with many repeated words can be longer than
    # the vocabulary, which previously raised IndexError.
    longest = max(
        (len(words) for words in want_answer_list + infact_answer_list),
        default=0,
    )
    width = max(len(vocabulary), longest)

    want_answer_corpus = _encode_sentences(
        want_answer_list, word_to_index, unknown_index, width)
    infact_answer_corpus = _encode_sentences(
        infact_answer_list, word_to_index, unknown_index, width)
    return want_answer_corpus, infact_answer_corpus
def label2onehot(label):
    """Convert correctness labels into a two-column one-hot matrix.

    :param label: sized iterable of labels (e.g. a series of 0/1 values); each
        entry must be convertible with int()
    :return: NumPy array of shape (len(label), 2). Column 1 is set to 1 for
        labels equal to 1, column 0 is set to 1 for every other label.
    """
    encoded = np.zeros((len(label), 2))
    for row, value in enumerate(label):
        # Anything that is not exactly 1 falls into class 0.
        column = 1 if int(value) == 1 else 0
        encoded[row, column] = 1
    return encoded
if __name__ == '__main__':
    # Running the module directly does nothing.
    pass