baseline: acc between 60 and 70

master
aolingwen 6 years ago
parent 6bbc684f45
commit c840fa81c9

@@ -0,0 +1,45 @@
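# IMDB sentiment baseline: pad word-index sequences to maxlen, embed them,
# run a single LSTM, and classify with a sigmoid output.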
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 80
batch_size = 32
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

@@ -0,0 +1,248 @@
\n
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
_
......
...................
./
.一
/
//
:
://
::
;
<
=
>
>>
?
@
[
\
]
^
_
`
|
}
~
~~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
”,
……
…………………………………………………
′∈
∈[
∪φ∈
──
 
》),
︿
)÷(1-
)、
+ξ
,也
-β
<±
<Δ
<λ
<φ
=″
=☆
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
]∧′=[
_
~±

@@ -0,0 +1,154 @@
import jieba
import pandas as pd
from sortedcontainers import SortedSet
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, GRU, Dropout
from keras.models import Model
import keras
from keras.utils import plot_model
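# Baseline classifier: the expected answer ('预期输出') and the actual answer ('实际输出')
# are segmented with jieba, encoded as word-index vectors, fed through two GRU branches,
# and merged into a softmax that predicts whether the answer is correct ('是否正确').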
def stopwordslist(filepath):
    '''
    Load the stopword file and turn it into a list
    :param filepath: path to the stopword file
    :return: list of stopwords
    '''
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
def seg_sentence(sentence):
    '''
    Segment a sentence into words
    :param sentence: the sentence
    :return: list of words after segmentation
    '''
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('./stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return list(jieba.cut(outstr))
def filter_uselessword(seg_list, uselesswords=[' ', '\r\n', '\n']):
    '''
    Filter useless symbols left over from segmentation, such as spaces and \n
    :param seg_list: list of words produced by the first segmentation pass
    :param uselesswords: list of useless symbols
    :return: filtered list of words
    '''
    for word in uselesswords:
        b = filter(lambda x: x != word, seg_list)
        seg_list = list(b)
    return seg_list
def build_corpus(raw_dataframe):
    '''
    Encode the expected output and the actual output as padded word-index vectors
    :param raw_dataframe: dataframe of the data table
    :return: index-encoded matrices for the expected output and the actual output
    '''
    word_set = SortedSet()
    want_answer_list = []
    infact_answer_list = []
    for i in range(raw_dataframe.shape[0]):
        want_answer = str(raw_dataframe.iloc[i]['预期输出'])
        infact_answer = str(raw_dataframe.iloc[i]['实际输出'])
        tmp = []
        for word in filter_uselessword(seg_sentence(want_answer)):
            word_set.add(word)
            tmp.append(word)
        want_answer_list.append(tmp)
        tmp = []
        for word in filter_uselessword(seg_sentence(infact_answer)):
            word_set.add(word)
            tmp.append(word)
        infact_answer_list.append(tmp)
    # reserve index 0 (the matrices are zero-initialised, so 0 acts as padding)
    word_set = list(word_set)
    word_set.insert(0, ' ')
    # padded index vectors for the expected output and the actual output
    want_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
    infact_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
    # replace each word of the expected output with its index in the vocabulary
    for i in range(len(want_answer_list)):
        for j in range(len(want_answer_list[i])):
            if want_answer_list[i][j] not in word_set:
                want_answer_corpus[i][j] = len(word_set)
            else:
                want_answer_corpus[i][j] = word_set.index(want_answer_list[i][j])
    # replace each word of the actual output with its index in the vocabulary
    for i in range(len(infact_answer_list)):
        for j in range(len(infact_answer_list[i])):
            if infact_answer_list[i][j] not in word_set:
                infact_answer_corpus[i][j] = len(word_set)
            else:
                infact_answer_corpus[i][j] = word_set.index(infact_answer_list[i][j])
    return want_answer_corpus, infact_answer_corpus
def label2onehot(label):
    '''
    Convert the labels to one-hot vectors
    :param label: series indicating whether each answer is correct
    :return: one-hot matrix
    '''
    onehot = np.zeros((len(label), 2))
    for i, l in enumerate(label):
        if int(l) == 1:
            onehot[i, 1] = 1
        else:
            onehot[i, 0] = 1
    return onehot
if __name__ == '__main__':
    df = pd.read_excel('./预期输出与实际输出数据表.xlsx')
    want_answer_corpus, infact_answer_corpus = build_corpus(df)
    onehot = label2onehot(df['是否正确'])
    x_train_1, x_test_1, y_train, y_test = train_test_split(want_answer_corpus, onehot, random_state=2333)
    x_train_2, x_test_2, _, _ = train_test_split(infact_answer_corpus, onehot, random_state=2333)
    inputs_want_answer = Input(shape=(len(want_answer_corpus[0]), ), name='want_answer_input')
    inputs_infact_answer = Input(shape=(len(infact_answer_corpus[0]), ), name='infact_answer_input')
    x_1 = Embedding(len(want_answer_corpus[0]), 64, name='want_answer_embedding')(inputs_want_answer)
    x_2 = Embedding(len(infact_answer_corpus[0]), 64, name='infact_answer_embedding')(inputs_infact_answer)
    # return_sequences must be a boolean; return the full sequence so Flatten has a time axis to unroll
    x_1 = GRU(64, dropout=0.5, return_sequences=True)(x_1)
    x_2 = GRU(64, dropout=0.5, return_sequences=True)(x_2)
    x = keras.layers.concatenate([x_1, x_2])
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(2, activation='softmax')(x)
    model = Model(inputs=[inputs_want_answer, inputs_infact_answer], outputs=predictions)
    # plot_model(model, to_file='model.png')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print('Train...')
    model.fit([x_train_1, x_train_2], y_train,
              batch_size=16,
              epochs=60)
    score, acc = model.evaluate([x_test_1, x_test_2], y_test,
                                batch_size=8)
    print('Test score:', score)
    print('Test accuracy:', acc)