from __future__ import print_function
import keras
from keras.datasets import imdb  # import the IMDB data set; this is native to Keras
from keras import losses
from keras import metrics
from keras import models
from keras import layers
from keras import optimizers
import matplotlib.pyplot as plt
import numpy as np
import pdb

#Michael Schneier 11/06/2019
#This program trains a neural network for binary classification of movie reviews on the Keras IMDB data set
#This program is based on the code in the third chapter of Deep Learning with Python by François Chollet.
#A Jupyter notebook variant written by François Chollet can be found
#on GitHub at https://github.com/fchollet/deep-learning-with-python-notebooks

# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#train_data, test_data - lists of reviews in the form of word indices; num_words=10000 keeps only the 10,000 most frequently occurring words in the data set
#train_labels, test_labels - lists of 0s and 1s; 0 is a negative review, 1 is a positive review
tot_words = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=tot_words) #Load training and test data
pdb.set_trace() #look at train_data[0], train_labels[0] and max([max(sequence) for sequence in train_data])

#Decode a review back into English words. Indices are offset by 3 because 0, 1, and 2 are reserved for padding, start-of-sequence, and unknown tokens.
word_index = imdb.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(reverse_word_index)
pdb.set_trace()

#Get data in the proper format so that we can use it to train a network
#One-hot encode: each review becomes a vector of length tot_words with 1s at the indices of the words it contains
def vectorize_sequences(sequences, dimension=tot_words):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # set specific indices of results[i] to 1s
    return results
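#A quick illustration of what the one-hot encoding above produces (a toy example for illustration, not part of the data set):
# vectorize_sequences([[1, 3]], dimension=5) -> array([[0., 1., 0., 1., 0.]])
# i.e. positions 1 and 3 are set to 1. and every other position stays 0.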
#Vectorize Data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
pdb.set_trace() # look at a sample after conversion, e.g. x_train[0]

######################
###Set up the Model###
######################
act_fun = 'relu' #The activation function we use for our network.
#Note: because of the nature of this problem we will always use a sigmoid for the output layer.
model = models.Sequential()
model.add(layers.Dense(16, activation=act_fun, input_shape=(tot_words,))) #this creates a layer 16 units wide; the input_shape argument tells Keras the size of our input
model.add(layers.Dense(16, activation=act_fun)) # We add another layer of the same size
model.add(layers.Dense(1, activation='sigmoid')) # our final layer outputs the scalar prediction of the sentiment of the review; the sigmoid output can be interpreted as a probability

#########################
###Configure Optimizer###
#########################
model.compile(optimizer='rmsprop', #We use binary cross entropy for the loss and rmsprop for the optimizer; both are prepackaged in Keras
              loss='binary_crossentropy',
              metrics=['accuracy'])

###We can configure the optimizer######
# model.compile(optimizer=optimizers.RMSprop(lr=0.001),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

###We can also configure the loss and the metrics######
# model.compile(optimizer=optimizers.RMSprop(lr=0.001),
#               loss=losses.binary_crossentropy,
#               metrics=[metrics.binary_accuracy])

#############################
###Validation Set############
#############################
x_val = x_train[:tot_words]
partial_x_train = x_train[tot_words:]
y_val = y_train[:tot_words]
partial_y_train = y_train[tot_words:]

tot_batch = 512
###Train the model################
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20, #Total number of times we iterate over the sample data
                    batch_size=tot_batch, #Number of samples we train on at a time
                    validation_data=(x_val, y_val)) #We monitor our accuracy against these as we train

#history is an object that contains data about what happened during training
history_dict = history.history
history_dict.keys()
pdb.set_trace() #look at history_dict.keys()

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

##############################################
#Plot some of our metrics to see what happened
##############################################
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# "b" is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.clf() # clear figure
acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

#Train the current model (already fit for 20 epochs) for 4 more epochs on the full training set, then evaluate it on the test set
model.fit(x_train, y_train, epochs=4, batch_size=tot_batch)
results = model.evaluate(x_test, y_test) #returns the loss value and metrics value of the model
print("The accuracy of our network:" + str(results[1])) #results[0] has the loss value; we are not interested in it in this case
pdb.set_trace()
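#An alternative to hand-picking the epoch count (a sketch, not part of the original exercise): a Keras
#EarlyStopping callback can halt training once the validation loss stops improving.
# from keras.callbacks import EarlyStopping
# stopper = EarlyStopping(monitor='val_loss', patience=2)  # stop after 2 epochs with no improvement
# model.fit(partial_x_train, partial_y_train,
#           epochs=20, batch_size=tot_batch,
#           validation_data=(x_val, y_val),
#           callbacks=[stopper])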
###Retrain a fresh network with fewer epochs###################################
model = models.Sequential()
model.add(layers.Dense(16, activation=act_fun, input_shape=(tot_words,)))
model.add(layers.Dense(16, activation=act_fun))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=tot_batch)
results = model.evaluate(x_test, y_test) #returns the loss value and metrics value of the model
print("The accuracy of our network:" + str(results[1]))

model.predict(x_test) ### This gives us the probability our model assigns to each review being positive
pdb.set_trace() #print model.predict; the model is confident for some reviews but not for others

######Things to Test###########################################################
#1. Try adding more hidden layers for the 4 epoch case and the 20 epoch case
#2. Try using wider and shallower networks
#3. Try using the mse loss function instead of binary_crossentropy
#4. Try using tanh and sigmoid activation instead of relu
#5. Look up and try other optimizers besides rmsprop
#6. Try messing with the batch size; how does the convergence behave?
#(A commented-out sketch of one such variant follows below.)
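#One possible variant combining several of the suggestions above (a sketch only; the width, activation,
#loss, and optimizer chosen here are illustrative, not recommendations):
# wide_model = models.Sequential()
# wide_model.add(layers.Dense(64, activation='tanh', input_shape=(tot_words,)))  #2, 4: one wider hidden layer with tanh
# wide_model.add(layers.Dense(1, activation='sigmoid'))
# wide_model.compile(optimizer='adam',  #5: a different prepackaged optimizer
#                    loss='mse',        #3: mean squared error instead of binary_crossentropy
#                    metrics=['accuracy'])
# wide_model.fit(x_train, y_train, epochs=4, batch_size=tot_batch)
# print(wide_model.evaluate(x_test, y_test))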