#! /usr/bin/env python3
#
def exercise3():

#*****************************************************************************80
#
## exercise3() processes the turtles dataset.
#
#  Discussion:
#
#    The turtles dataset contains a mixture of character and numeric data.
#    It is stored as a CSV file.
#    We will read the data using the pandas function read_csv()
#    and then convert the numeric columns to a numpy array for processing.
#
#    The function takes no arguments and returns nothing.  It prints a
#    report to standard output, saves a histogram of the "Length" column
#    to 'exercise3.jpg', displays it, and finally deletes the datafile,
#    whether it was downloaded by this call or was already present.
#
#    The statistics are computed for each column over all the rows:
#    minimum, maximum, range, mean, variance (numpy default, that is,
#    the sum of squared deviations divided by the number of rows), and
#    RMS = sqrt ( sum ( x^2 ) / rows ).
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license. 
#
#  Modified:
#
#    14 January 2022
#
#  Author:
#
#    John Burkardt
#
  from os.path import exists
  import matplotlib.pyplot as plt
  import numpy as np
  import os
  import pandas as pd
  import wget

  print ( "exercise3():" )
  print ( "  Get a datafile in csv format from the class website." )
  print ( "  Read the data from the datafile." )
  print ( "  Print the first five lines." )
  print ( "  Convert the numeric columns 3, 4, 5, to a numpy array." )
  print ( "  Compute some statistical measurements." )
  print ( "  When finished, delete the datafile." )
#
#  Get a copy of the datafile from the website, unless a local copy exists.
#  wget.download() returns the name of the file it actually wrote, so that
#  name is the one used for reading and for the final cleanup.
#
  filename = 'turtles.csv'
  if ( exists ( filename ) ):
    print ( "  Already have a copy of " + filename )
  else:
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + filename
    filename = wget.download ( url )
#
#  Read the data from the file into a "dataframe".
#
  df = pd.read_csv ( filename )
#
#  Never try to print more lines than the dataset actually contains.
#
  head = min ( 5, len ( df ) )
#
#  Print the first five lines.
#  Positional indexing (iloc) is used so that the loop counts rows,
#  not index labels.
#
  print ( '' )
  print ( '  First five lines of df:' )
  print ( '' )
  for i in range ( 0, head ):
    print ( df.iloc[i] )
#
#  Copy dataframe contents into a numpy array.
#  The numeric columns are labeled "Length", "Width", "Height".
#
  data = df[['Length','Width','Height']].to_numpy ( )

  rows, cols = np.shape ( data )
  print ( '' )
  print ( '  "' + filename + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Print the first five lines.
#
  print ( '' )
  print ( '  First five lines of data:' )
  print ( '' )
  for i in range ( 0, head ):
    print ( '  (%d)  ' % ( i ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( data[i,j] ), end = '' )
    print ( '' )
#
#  Do a histogram of the first column, "Length".
#
  plt.clf ( )
  plt.hist ( data[:,0], bins = 20 )
  plt.grid ( True )
  plt.title ( 'turtles data: "Length"' )
  filename2 = 'exercise3.jpg'
  plt.savefig ( filename2 )
  print ( "  Graphics saved as '" + filename2 + "'" )
  plt.show ( )
  plt.close ( )
#
#  Compute statistical data for each column.
#  Row i of s holds statistic i; column j of s corresponds to column j of data.
#  All reductions run down the rows (axis = 0), one result per column.
#
  s = np.zeros ( [ 6, cols ] )

  s[0,:] = np.min ( data, axis = 0 )
  s[1,:] = np.max ( data, axis = 0 )
  s[2,:] = s[1,:] - s[0,:]
  s[3,:] = np.mean ( data, axis = 0 )
  s[4,:] = np.var ( data, axis = 0 )
#
#  RMS = sqrt ( sum ( x^2 ) / n ), where n is the number of samples (rows),
#  not the number of columns.
#
  s[5,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

  labels = ( 'Minimum ', 'Maximum ', 'Range   ', 'Mean    ', 'Variance', 'RMS     ' )

  print ( '' )
  print ( '  Statistics for data:' )
  print ( '' )
  for i in range ( 0, 6 ):
    print ( '  %8s  ' % ( labels[i] ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( s[i,j] ), end = '' )
    print ( '' )
#
#  Discard the data file.
#
  print ( "" )
  print ( "  Deleting dataset file:" )
  os.remove ( filename )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise3:" )
  print ( "  Normal end of execution." )

  return

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported as a module.
#
if __name__ == "__main__":
  exercise3()