#! /usr/bin/env python3
#
def imdb ( ):

#*****************************************************************************80
#
## imdb() uses keras to classify movie reviews.
#
#  Discussion:
#
#    The IMDB movie review dataset is loaded, keeping only the word_num
#    most frequent words.  Each review is converted to a 0/1 vector of
#    length word_num.  A small dense network is trained on part of the
#    training data, validated on the remainder, and then evaluated on
#    the test data.  Results are printed; nothing is returned.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    12 November 2019
#
#  Author:
#
#    Original version by Francois Chollet;
#    This version by John Burkardt.
#
#  Reference:
#
#    Francois Chollet,
#    Deep Learning with Python,
#    Manning, 2018,
#    ISBN: 9781617294433.
#
  import numpy as np
  import platform
  import tensorflow
#
#  Use the word_num most frequent words.
#  An earlier setting of 10000 was replaced by the smaller value below.
#
  word_num = 1000

  print ( '' )
  print ( 'imdb():' )
  print ( '  python version:     ' + platform.python_version ( ) )
  print ( '  numpy version:      ' + np.version.version )
  print ( '  tensorflow version: ' + tensorflow.__version__ )
  print ( '' )
  print ( '  Use a neural network to classify movie reviews as negative (0)' )
  print ( '  or positive (1) based on the usage of the ', word_num )
  print ( '  most common words.' )
#
#  Import the movie review dataset.
#  The module is aliased so that it does not hide the name of this function.
#
  from tensorflow.keras.datasets import imdb as imdb_dataset
#
#  Load the movie review dataset.
#
  ( train_data, train_labels ), ( test_data, test_labels ) = \
    imdb_dataset.load_data ( num_words = word_num )
#
#  Print a sample of the data and its label.
#
  print ( '' )
  print ( '  Sample training data #0:' )
  print ( train_data[0] )
  print ( '  Label for sample training data #0:' )
  print ( train_labels[0] )
#
#  Verify that no word index exceeds word_num.
#
  max_index = max ( max ( sequence ) for sequence in train_data )
  print ( '' )
  print ( '  Maximum index in training data is ', max_index )
#
#  Reformat the movie reviews as 0/1 vectors of length word_num.
#  The dimension must be passed explicitly; otherwise the default of
#  10000 is used and most columns are never set.
#
  x_train = vectorize_sequences ( train_data, dimension = word_num )
  x_test = vectorize_sequences ( test_data, dimension = word_num )
#
#  Print the sample of the data after conversion to binary vector.
#
  print ( '' )
  print ( '  Sample training data #0 after vectorization:' )
  print ( x_train[0] )
  print ( '  Label for sample training data #0:' )
  print ( train_labels[0] )
#
#  Also convert label arrays to the proper numeric type.
#
  y_train = np.asarray ( train_labels ).astype ( 'float32' )
  y_test = np.asarray ( test_labels ).astype ( 'float32' )
#
#  Define the model.
#
  from tensorflow import keras
  from tensorflow.keras import layers
#
#  Describe the model: two hidden layers of 16 units, and a single
#  sigmoid output.
#
  model = keras.Sequential ( [
    layers.Dense ( 16, activation = 'relu' ),
    layers.Dense ( 16, activation = 'relu' ),
    layers.Dense ( 1, activation = 'sigmoid' )
  ] )
#
#  Compile the model.
#
  model.compile (
    optimizer = 'rmsprop',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'] )
#
#  Split the 25,000 data items into 10,000 validation and 15,000 training items.
#
  x_val = x_train[:10000]
  partial_x_train = x_train[10000:]

  y_val = y_train[:10000]
  partial_y_train = y_train[10000:]
#
#  Use the model for training and validation.
#
  history = model.fit (
    partial_x_train,
    partial_y_train,
    epochs = 20,
    batch_size = 512,
    validation_data = ( x_val, y_val ) )
#
#  Report the validation loss and accuracy after the last epoch.
#
  val_acc = history.history['val_accuracy']
  val_loss = history.history['val_loss']
  print ( '' )
  print ( '  Model loss and accuracy on validation data:' )
  print ( '    Final validation loss', val_loss[-1] )
  print ( '    Final validation accuracy', val_acc[-1] )
#
#  Apply the model to new test data.
#
  print ( '' )
  print ( '  Test the model:' )
  results = model.evaluate ( x_test, y_test )
#
#  Report the test loss and accuracy.
#
  print ( '' )
  print ( '  Model loss and accuracy on test data:' )
  for metric_name, metric_value in zip ( model.metrics_names, results ):
    print ( metric_name, metric_value )
#
#  Demonstrate what the model predicts on some of the test data.
#  The predictions are flattened to a vector so that entry i can be
#  printed as a single number.
#
  x_predict = model.predict ( x_test )
  x_predict = x_predict.flatten ( )

  print ( '' )
  print ( '  Model predictions on first 20 test movie reviews:' )
  print ( '   # Predict  Actual' )

  for i in range ( 0, 20 ):
    print ( '  %2d  %6.4f    %d' % ( i, x_predict[i], test_labels[i] ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'imdb():' )
  print ( '  Normal end of execution.' )
  return

def vectorize_sequences ( sequences, dimension = 10000 ):

#*****************************************************************************80
#
## vectorize_sequences() replaces each review by a 0/1 vector.
#
#  Discussion:
#
#    Each numerically coded movie review is replaced by a vector of
#    length dimension.  Entry j of the vector is 1 if the index j occurs
#    one or more times in the review, and 0 otherwise.
#
#  Input:
#
#    sequences: a sized collection of reviews.  Each review is an
#    iterable of integer word indices, all of which must be valid
#    column indices for a row of length dimension.
#
#    integer dimension: the length of each output vector.
#
#  Output:
#
#    real results(len(sequences),dimension): row i is the 0/1 vector
#    for review i.
#
  import numpy as np

  results = np.zeros ( ( len ( sequences ), dimension ) )

  for i, sequence in enumerate ( sequences ):
#
#  A single indexed assignment sets every listed column of row i.
#  Repeated indices are harmless, and an empty review leaves the row zero.
#
    results[i,list ( sequence )] = 1.0

  return results

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the current date and time on one line.
#
#  Discussion:
#
#    The current clock reading is obtained from time.time(), converted
#    to text by time.ctime(), and printed.  Nothing is returned.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 August 2019
#
#  Author:
#
#    John Burkardt
#
  from time import ctime
  from time import time as seconds_now

  stamp = ctime ( seconds_now ( ) )
  print ( stamp )

  return

#
#  When run as a script, bracket the demonstration with timestamps.
#
if __name__ == '__main__':
  timestamp ( )
  imdb ( )
  timestamp ( )