'''Jieba-based Chinese text segmentation and corpus-encoding utilities.'''
import jieba
import numpy as np

from sortedcontainers import SortedSet


def stopwordslist(filepath):
    '''
    Load a stopword file and return its entries as a list.

    :param filepath: path to the stopword file (one word per line, UTF-8)
    :return: list of stopwords with surrounding whitespace stripped
    '''
    # Context manager guarantees the handle is closed even on error
    # (the original left the file open); iterating the handle directly
    # also avoids materializing readlines() first.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def seg_sentence(sentence):
    '''
    Segment a sentence with jieba and drop stopwords.

    :param sentence: input sentence (str)
    :return: token list produced by re-cutting the space-joined kept
             words; this includes ' ' tokens, which callers remove via
             filter_uselessword
    '''
    # Stopword path is loaded on every call; a set makes each membership
    # test O(1) where the original scanned a list per token.
    stopwords = set(stopwordslist('./stopwords.txt'))  # stopword file path
    kept = [w for w in jieba.cut(sentence.strip())
            if w not in stopwords and w != '\t']
    # Single join instead of quadratic string '+='; the trailing space
    # after every word matches the original concatenation exactly.
    outstr = ''.join(w + ' ' for w in kept)
    # NOTE(review): re-cutting the joined string looks redundant, but it
    # is kept deliberately to preserve the original output shape
    # (words interleaved with ' ' tokens).
    return list(jieba.cut(outstr))


def filter_uselessword(seg_list, uselesswords=(' ', '\r\n', '\n')):
    '''
    Remove useless tokens (spaces, newlines, etc.) from a token list.

    :param seg_list: list of tokens from the initial segmentation
    :param uselesswords: iterable of tokens to drop (default: whitespace)
    :return: new list with the useless tokens removed
    '''
    # Immutable tuple default replaces the original mutable list default
    # (the classic shared-default pitfall); a set gives O(1) membership,
    # and a single pass replaces the original one-filter-per-token loop.
    drop = set(uselesswords)
    return [token for token in seg_list if token not in drop]


def build_corpus(raw_dataframe):
    '''
    Encode the expected and actual answers of each row as index vectors.

    Each row's '预期输出' (expected output) and '实际输出' (actual output)
    columns are segmented and filtered, and every word is replaced by its
    index in a sorted vocabulary built from both columns (index 0 is
    reserved for the padding token ' ').

    :param raw_dataframe: dataframe with '预期输出' and '实际输出' columns
    :return: tuple (want_answer_corpus, infact_answer_corpus) of numpy
             arrays, each of shape (n_rows, vocab_size)
    '''
    word_set = SortedSet()
    want_answer_list = []
    infact_answer_list = []

    for i in range(raw_dataframe.shape[0]):
        want_answer = str(raw_dataframe.iloc[i]['预期输出'])
        infact_answer = str(raw_dataframe.iloc[i]['实际输出'])

        tmp = []
        for word in filter_uselessword(seg_sentence(want_answer)):
            word_set.add(word)
            tmp.append(word)
        want_answer_list.append(tmp)

        tmp = []
        for word in filter_uselessword(seg_sentence(infact_answer)):
            word_set.add(word)
            tmp.append(word)
        infact_answer_list.append(tmp)

    # Reserve slot 0 for the padding token ' '.
    vocab = list(word_set)
    vocab.insert(0, ' ')
    # Precomputed word -> index map gives O(1) lookups; the original
    # called list.index (O(V)) inside the encoding loops, making the
    # encoding quadratic in vocabulary size.
    index_of = {word: idx for idx, word in enumerate(vocab)}
    oov = len(vocab)  # fallback index for out-of-vocabulary words

    # NOTE(review): a row with more tokens than vocabulary entries would
    # index past the second axis — this assumes every sentence is shorter
    # than the vocabulary; confirm against real data.
    want_answer_corpus = np.zeros((raw_dataframe.shape[0], len(vocab)))
    infact_answer_corpus = np.zeros((raw_dataframe.shape[0], len(vocab)))

    # Replace each word of the expected output with its vocabulary index.
    for i, words in enumerate(want_answer_list):
        for j, word in enumerate(words):
            # .get() keeps the original OOV fallback (normally unreachable,
            # since every word was added to the vocabulary above).
            want_answer_corpus[i][j] = index_of.get(word, oov)

    # Replace each word of the actual output with its vocabulary index.
    for i, words in enumerate(infact_answer_list):
        for j, word in enumerate(words):
            infact_answer_corpus[i][j] = index_of.get(word, oov)

    return want_answer_corpus, infact_answer_corpus


def label2onehot(label):
    '''
    Convert a 0/1 label sequence to a one-hot matrix.

    :param label: sequence of "is correct" labels; any value whose
                  int() equals 1 maps to class 1, everything else to
                  class 0
    :return: numpy array of shape (len(label), 2)
    '''
    onehot = np.zeros((len(label), 2))
    for row, value in enumerate(label):
        # Pick the hot column directly instead of branching on it.
        column = 1 if int(value) == 1 else 0
        onehot[row, column] = 1
    return onehot


# Script entry point: the module currently defines only library helpers,
# so running it directly does nothing.
if __name__ == '__main__':
    pass