baseline: acc between 60 and 70

master
aolingwen 6 years ago
parent 6bbc684f45
commit c840fa81c9

@@ -0,0 +1,45 @@
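# IMDB sentiment baseline: pad word-index sequences to maxlen, embed them,
# run a single LSTM, and classify with a sigmoid output.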
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 80
batch_size = 32
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

@@ -0,0 +1,248 @@
\n
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
_
......
...................
./
.一
/
//
:
://
::
;
<
=
>
>>
?
@
[
\
]
^
_
`
|
}
~
~~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
”,
……
…………………………………………………
′∈
∈[
∪φ∈
──
 
》),
︿
)÷(1-
)、
+ξ
,也
-β
<±
<Δ
<λ
<φ
=″
=☆
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
]∧′=[
_
~±

@@ -0,0 +1,154 @@
import jieba
import pandas as pd
from sortedcontainers import SortedSet
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, GRU, Dropout
from keras.models import Model
import keras
from keras.utils import plot_model
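# Baseline classifier: the expected answer ('预期输出') and the actual answer ('实际输出')
# are segmented with jieba, encoded as word-index vectors, fed through two GRU branches,
# and merged into a softmax that predicts whether the answer is correct ('是否正确').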
def stopwordslist(filepath):
    '''
    Load the stopword file and turn it into a list
    :param filepath: path to the stopword file
    :return: list of stopwords
    '''
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
def seg_sentence(sentence):
    '''
    Segment a sentence into words
    :param sentence: the sentence
    :return: list of words after segmentation
    '''
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('./stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return list(jieba.cut(outstr))
def filter_uselessword(seg_list, uselesswords=[' ', '\r\n', '\n']):
    '''
    Filter useless symbols left over from segmentation, such as spaces and \n
    :param seg_list: list of words produced by the first segmentation pass
    :param uselesswords: list of useless symbols
    :return: filtered list of words
    '''
    for word in uselesswords:
        b = filter(lambda x: x != word, seg_list)
        seg_list = list(b)
    return seg_list
def build_corpus(raw_dataframe):
    '''
    Encode the expected output and the actual output as padded word-index vectors
    :param raw_dataframe: dataframe of the data table
    :return: index-encoded matrices for the expected output and the actual output
    '''
    word_set = SortedSet()
    want_answer_list = []
    infact_answer_list = []
    for i in range(raw_dataframe.shape[0]):
        want_answer = str(raw_dataframe.iloc[i]['预期输出'])
        infact_answer = str(raw_dataframe.iloc[i]['实际输出'])
        tmp = []
        for word in filter_uselessword(seg_sentence(want_answer)):
            word_set.add(word)
            tmp.append(word)
        want_answer_list.append(tmp)
        tmp = []
        for word in filter_uselessword(seg_sentence(infact_answer)):
            word_set.add(word)
            tmp.append(word)
        infact_answer_list.append(tmp)
    # reserve index 0 (the matrices are zero-initialised, so 0 acts as padding)
    word_set = list(word_set)
    word_set.insert(0, ' ')
    # padded index vectors for the expected output and the actual output
    want_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
    infact_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
    # replace each word of the expected output with its index in the vocabulary
    for i in range(len(want_answer_list)):
        for j in range(len(want_answer_list[i])):
            if want_answer_list[i][j] not in word_set:
                want_answer_corpus[i][j] = len(word_set)
            else:
                want_answer_corpus[i][j] = word_set.index(want_answer_list[i][j])
    # replace each word of the actual output with its index in the vocabulary
    for i in range(len(infact_answer_list)):
        for j in range(len(infact_answer_list[i])):
            if infact_answer_list[i][j] not in word_set:
                infact_answer_corpus[i][j] = len(word_set)
            else:
                infact_answer_corpus[i][j] = word_set.index(infact_answer_list[i][j])
    return want_answer_corpus, infact_answer_corpus
def label2onehot(label):
    '''
    Convert the labels to one-hot vectors
    :param label: series indicating whether each answer is correct
    :return: one-hot matrix
    '''
    onehot = np.zeros((len(label), 2))
    for i, l in enumerate(label):
        if int(l) == 1:
            onehot[i, 1] = 1
        else:
            onehot[i, 0] = 1
    return onehot
if __name__ == '__main__':
    df = pd.read_excel('./预期输出与实际输出数据表.xlsx')
    want_answer_corpus, infact_answer_corpus = build_corpus(df)
    onehot = label2onehot(df['是否正确'])
    x_train_1, x_test_1, y_train, y_test = train_test_split(want_answer_corpus, onehot, random_state=2333)
    x_train_2, x_test_2, _, _ = train_test_split(infact_answer_corpus, onehot, random_state=2333)
    inputs_want_answer = Input(shape=(len(want_answer_corpus[0]), ), name='want_answer_input')
    inputs_infact_answer = Input(shape=(len(infact_answer_corpus[0]), ), name='infact_answer_input')
    x_1 = Embedding(len(want_answer_corpus[0]), 64, name='want_answer_embedding')(inputs_want_answer)
    x_2 = Embedding(len(infact_answer_corpus[0]), 64, name='infact_answer_embedding')(inputs_infact_answer)
    # return_sequences must be a boolean; return the full sequence so Flatten has a time axis to unroll
    x_1 = GRU(64, dropout=0.5, return_sequences=True)(x_1)
    x_2 = GRU(64, dropout=0.5, return_sequences=True)(x_2)
    x = keras.layers.concatenate([x_1, x_2])
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(2, activation='softmax')(x)
    model = Model(inputs=[inputs_want_answer, inputs_infact_answer], outputs=predictions)
    # plot_model(model, to_file='model.png')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print('Train...')
    model.fit([x_train_1, x_train_2], y_train,
              batch_size=16,
              epochs=60)
    score, acc = model.evaluate([x_test_1, x_test_2], y_test,
                                batch_size=8)
    print('Test score:', score)
    print('Test accuracy:', acc)