diff --git a/network.py b/network.py
index e69de29..59208d7 100644
--- a/network.py
+++ b/network.py
@@ -0,0 +1,45 @@
+from keras.preprocessing import sequence
+from keras.models import Sequential
+from keras.layers import Dense, Embedding
+from keras.layers import LSTM
+from keras.datasets import imdb
+
+max_features = 20000
+# cut texts after this number of words (among top max_features most common words)
+maxlen = 80
+batch_size = 32
+
+print('Loading data...')
+(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
+print(len(x_train), 'train sequences')
+print(len(x_test), 'test sequences')
+
+print('Pad sequences (samples x time)')
+x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
+x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
+print('x_train shape:', x_train.shape)
+print('x_test shape:', x_test.shape)
+
+
+print('Build model...')
+model = Sequential()
+model.add(Embedding(max_features, 128))
+model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+model.add(Dense(1, activation='sigmoid'))
+
+# try using different optimizers and different optimizer configs
+model.compile(loss='binary_crossentropy',
+              optimizer='adam',
+              metrics=['accuracy'])
+
+model.summary()
+
+print('Train...')
+model.fit(x_train, y_train,
+          batch_size=batch_size,
+          epochs=15,
+          validation_data=(x_test, y_test))
+score, acc = model.evaluate(x_test, y_test,
+                            batch_size=batch_size)
+print('Test score:', score)
+print('Test accuracy:', acc)
\ No newline at end of file
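For reference, sequence.pad_sequences above both pads and truncates: with the Keras defaults (padding='pre', truncating='pre') every row comes out exactly maxlen long, zero-filled on the left and cut from the front. A minimal sketch of that behavior:

    from keras.preprocessing import sequence

    # toy sequences of unequal length
    seqs = [[11, 23, 5], [7, 2, 91, 4, 18, 6]]
    padded = sequence.pad_sequences(seqs, maxlen=4)
    # left-padded / front-truncated by default:
    # [[ 0 11 23  5]
    #  [91  4 18  6]]
    print(padded)
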
diff --git a/stopwords.txt b/stopwords.txt
new file mode 100644
index 0000000..fde63a5
--- /dev/null
+++ b/stopwords.txt
@@ -0,0 +1,248 @@
+\n
+
+!
+"
+#
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+--
+.
+..
+...
+_
+......
+...................
+./
+.一
+/
+//
+:
+://
+::
+;
+<
+=
+>
+>>
+?
+@
+[
+\
+]
+^
+_
+`
+|
+}
+~
+~~~~
+·
+×
+×××
+Δ
+Ψ
+γ
+μ
+φ
+φ.
+В
+—
+——
+———
+‘
+’
+’‘
+“
+”
+”,
+…
+……
+…………………………………………………
+′∈
+′|
+℃
+Ⅲ
+↑
+→
+∈[
+∪φ∈
+≈
+①
+②
+②
+③
+③
+④
+⑤
+⑥
+⑦
+⑧
+⑨
+⑩
+──
+■
+▲
+　
+、
+。
+〈
+〉
+《
+》
+》),
+」
+『
+』
+【
+】
+〔
+〕
+〕〔
+︿
+!
+#
+$
+%
+&
+'
+(
+)
+)÷(1-
+)、
+*
++
++ξ
+++
+,
+,也
+-
+-β
+--
+-[*]-
+.
+/
+
+:
+;
+<
+<±
+<Δ
+<λ
+<φ
+<<
+=
+=″
+=☆
+=(
+=-
+=[
+={
+>
+?
+@
+[
+[①①]
+[①②]
+[①③]
+[①④]
+[①⑤]
+[①⑥]
+[①⑦]
+[①⑧]
+[①⑨]
+[①A]
+[①B]
+[①C]
+[①D]
+[①E]
+[①]
+[①a]
+[①c]
+[①d]
+[①e]
+[①f]
+[①g]
+[①h]
+[①i]
+[①o]
+[②
+[②①]
+[②②]
+[②③]
+[②④
+[②⑤]
+[②⑥]
+[②⑦]
+[②⑧]
+[②⑩]
+[②B]
+[②G]
+[②]
+[②a]
+[②b]
+[②c]
+[②d]
+[②e]
+[②f]
+[②g]
+[②h]
+[②i]
+[②j]
+[③①]
+[③⑩]
+[③F]
+[③]
+[③a]
+[③b]
+[③c]
+[③d]
+[③e]
+[③g]
+[③h]
+[④]
+[④a]
+[④b]
+[④c]
+[④d]
+[④e]
+[⑤]
+[⑤]]
+[⑤a]
+[⑤b]
+[⑤d]
+[⑤e]
+[⑤f]
+[⑥]
+[⑦]
+[⑧]
+[⑨]
+[⑩]
+[*]
+[-
+[]
+]
+]∧′=[
+][
+_
+a]
+b]
+c]
+e]
+f]
+{
+{-
+|
+}
+}>
+~
+~±
+~+
+¥
\ No newline at end of file
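stopwords.txt is a symbol-heavy stopword list; utils.py below filters jieba's segmenter output against it. A minimal sketch of that filtering step in isolation, reusing stopwordslist from utils.py (the sample sentence is made up):

    import jieba
    from utils import stopwordslist

    stopwords = stopwordslist('./stopwords.txt')
    # segment, then drop whitespace tokens and stopwords
    words = [w for w in jieba.lcut('预期输出与实际输出一致')
             if w.strip() and w not in stopwords]
    print(words)
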
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..f2c9cc5
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,145 @@
+import jieba
+import pandas as pd
+from sortedcontainers import SortedSet
+import numpy as np
+from sklearn.model_selection import train_test_split
+from keras.layers import Dense, Embedding, Input, Flatten
+from keras.layers import GRU
+from keras.models import Model
+import keras
+from keras.utils import plot_model
+
+
+def stopwordslist(filepath):
+    '''
+    Load the stopword file and turn it into a list
+    :param filepath: path to the stopword file
+    :return: list of stopwords
+    '''
+    with open(filepath, 'r', encoding='utf-8') as f:
+        stopwords = [line.strip() for line in f]
+    return stopwords
+
+def seg_sentence(sentence):
+    '''
+    Segment a sentence into words
+    :param sentence: the sentence to segment
+    :return: list of words after segmentation
+    '''
+    sentence_seged = jieba.cut(sentence.strip())
+    stopwords = stopwordslist('./stopwords.txt')  # path of the stopword file loaded here
+    outstr = ''
+    for word in sentence_seged:
+        if word not in stopwords:
+            if word != '\t':
+                outstr += word
+                outstr += " "
+    return outstr.split()
+
+def filter_uselessword(seg_list, uselesswords=(' ', '\r\n', '\n')):
+    '''
+    Filter useless symbols (spaces, newlines and so on) out of the initial segmentation
+    :param seg_list: list of words produced by the initial segmentation
+    :param uselesswords: collection of useless symbols
+    :return: filtered word list
+    '''
+    return [word for word in seg_list if word not in uselesswords]
+
+def build_corpus(raw_dataframe):
+    '''
+    Turn the expected and actual outputs into zero-padded word-index sequences
+    :param raw_dataframe: DataFrame of the data table
+    :return: index matrices for the expected and the actual outputs
+    '''
+    word_set = SortedSet()
+    want_answer_list = []
+    infact_answer_list = []
+    for i in range(raw_dataframe.shape[0]):
+        want_answer = str(raw_dataframe.iloc[i]['预期输出'])
+        infact_answer = str(raw_dataframe.iloc[i]['实际输出'])
+        tmp = []
+        for word in filter_uselessword(seg_sentence(want_answer)):
+            word_set.add(word)
+            tmp.append(word)
+        want_answer_list.append(tmp)
+        tmp = []
+        for word in filter_uselessword(seg_sentence(infact_answer)):
+            word_set.add(word)
+            tmp.append(word)
+        infact_answer_list.append(tmp)
+
+    # reserve index 0 for padding
+    word_set = list(word_set)
+    word_set.insert(0, ' ')
+    word_index = {word: idx for idx, word in enumerate(word_set)}
+
+    # zero-padded index matrices for the expected and the actual outputs
+    want_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
+    infact_answer_corpus = np.zeros((raw_dataframe.shape[0], len(word_set)))
+
+    # replace every word in the expected output with its index in the
+    # vocabulary; unknown words map to len(word_set), one past the last index
+    for i in range(len(want_answer_list)):
+        for j in range(len(want_answer_list[i])):
+            want_answer_corpus[i][j] = word_index.get(want_answer_list[i][j], len(word_set))
+
+    # replace every word in the actual output with its index in the vocabulary
+    for i in range(len(infact_answer_list)):
+        for j in range(len(infact_answer_list[i])):
+            infact_answer_corpus[i][j] = word_index.get(infact_answer_list[i][j], len(word_set))
+
+    return want_answer_corpus, infact_answer_corpus
+
+
+def label2onehot(label):
+    '''
+    Turn labels into one-hot vectors
+    :param label: the 是否正确 (correct or not) series
+    :return: one-hot array
+    '''
+    onehot = np.zeros((len(label), 2))
+    for i, l in enumerate(label):
+        if int(l) == 1:
+            onehot[i, 1] = 1
+        else:
+            onehot[i, 0] = 1
+    return onehot
+
+
+if __name__ == '__main__':
+    df = pd.read_excel('./预期输出与实际输出数据表.xlsx')
+    want_answer_corpus, infact_answer_corpus = build_corpus(df)
+    onehot = label2onehot(df['是否正确'])
+    x_train_1, x_test_1, y_train, y_test = train_test_split(want_answer_corpus, onehot, random_state=2333)
+    x_train_2, x_test_2, _, _ = train_test_split(infact_answer_corpus, onehot, random_state=2333)
+
+    # vocabulary size; +1 leaves room for the out-of-vocabulary index
+    vocab_size = want_answer_corpus.shape[1] + 1
+
+    inputs_want_answer = Input(shape=(len(want_answer_corpus[0]), ), name='want_answer_input')
+    inputs_infact_answer = Input(shape=(len(infact_answer_corpus[0]), ), name='infact_answer_input')
+    x_1 = Embedding(vocab_size, 64, name='want_answer_embedding')(inputs_want_answer)
+    x_2 = Embedding(vocab_size, 64, name='infact_answer_embedding')(inputs_infact_answer)
+    # keep the full sequences so the two branches can be concatenated and flattened
+    x_1 = GRU(64, dropout=0.5, recurrent_dropout=0.2, return_sequences=True)(x_1)
+    x_2 = GRU(64, dropout=0.5, recurrent_dropout=0.2, return_sequences=True)(x_2)
+    x = keras.layers.concatenate([x_1, x_2])
+    x = Flatten()(x)
+    x = Dense(64, activation='relu')(x)
+    predictions = Dense(2, activation='softmax')(x)
+    model = Model(inputs=[inputs_want_answer, inputs_infact_answer], outputs=predictions)
+
+    # plot_model(model, to_file='model.png')
+
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])
+
+    print('Train...')
+    model.fit([x_train_1, x_train_2], y_train,
+              batch_size=16,
+              epochs=60)
+    score, acc = model.evaluate([x_test_1, x_test_2], y_test,
+                                batch_size=8)
+    print('Test score:', score)
+    print('Test accuracy:', acc)
diff --git a/预期输出与实际输出数据表.xlsx b/预期输出与实际输出数据表.xlsx
index 877dd4e..1238b05 100644
Binary files a/预期输出与实际输出数据表.xlsx and b/预期输出与实际输出数据表.xlsx differ
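
Once utils.py has trained the two-input model, inference is a predict call on an (expected, actual) pair of index rows. A minimal sketch against the held-out split built above; per label2onehot, column 1 of the softmax output is the "correct" class:

    import numpy as np

    # probabilities for a few held-out pairs
    probs = model.predict([x_test_1[:5], x_test_2[:5]], batch_size=8)
    pred = np.argmax(probs, axis=1)   # 1 = correct, 0 = incorrect
    true = np.argmax(y_test[:5], axis=1)
    print(pred, true)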