#! /usr/bin/env python3
#
def exercise2():

#*****************************************************************************80
#
## exercise2() processes the wine dataset.
#
#  Discussion:
#
#    The wine dataset is entirely numeric, but is stored as a CSV file.
#    The easiest way to read the data is to use the pandas function read_csv()
#    and then convert the dataframe into a numpy array.
#
#    The function downloads the datafile unless a local copy already exists,
#    prints the first few rows, saves a histogram of column 1 to
#    'exercise2.jpg', prints per-column statistics before and after
#    standardization, and finally deletes the datafile.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    14 January 2022
#
#  Author:
#
#    John Burkardt
#
  from os.path import exists
  import matplotlib.pyplot as plt
  import numpy as np
  import os
  import pandas as pd

  print ( "exercise2():" )
  print ( "  Get a datafile in csv format from the class website." )
  print ( "  Read the data from the datafile into a pandas() dataframe." )
  print ( "  Print the first five lines." )
  print ( "  Copy the data from the dataframe into a numpy() array." )
  print ( "  Make a histogram of some of the features." )
  print ( "  Compute some statistical measurements." )
  print ( "  Standardize the data, and recompute the statistics." )
  print ( "  When finished, delete the datafile." )
#
#  Get a copy of the datafile from the website.
#
  filename = 'wine.csv'
  if ( exists ( filename ) ):
    print ( "  Already have a copy of " + filename )
  else:
#
#  wget is only needed when the file must actually be fetched, so it is
#  imported here rather than being required on every run.
#
    import wget
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + filename
    filename = wget.download ( url )
#
#  Read the data from the file into a "dataframe".
#
  df = pd.read_csv ( filename )
#
#  Copy dataframe contents into a numpy array.
#  (We can only do this simple operation because the entire array is numeric.)
#
  data = df.to_numpy ( )

  rows, cols = np.shape ( data )
  print ( '' )
  print ( '  "' + filename + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Print the first five lines (or fewer, if the file is shorter than that).
#
  print ( '' )
  print ( '  First five lines of data:' )
  print ( '' )
  for i in range ( 0, min ( 5, rows ) ):
    print ( '  (%d)  ' % ( i ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( data[i,j] ), end = '' )
    print ( '' )
#
#  Do a histogram of the second column.
#
  plt.clf ( )
  plt.hist ( data[:,1], bins = 20 )
  plt.grid ( True )
  plt.title ( 'wine data: column[1]' )
  filename2 = 'exercise2.jpg'
  plt.savefig ( filename2 )
  print ( "  Graphics saved as '" + filename2 + "'" )
  plt.show ( )
  plt.close ( )
#
#  Compute statistical data for each column.
#
  s = _column_statistics ( data )
  _print_statistics ( '  Statistics for data:', s )
#
#  Standardize the data: subtract the column mean (row 3 of s) and divide
#  by the column standard deviation (square root of row 4 of s).
#  A constant column has zero deviation; use a scale of 1 there so that it
#  becomes a column of zeros instead of NaN's.
#
  sigma = np.sqrt ( s[4,:] )
  sigma[ sigma == 0.0 ] = 1.0
  data2 = ( data - s[3,:] ) / sigma
#
#  Recompute statistical data for each column.
#
  s = _column_statistics ( data2 )
  _print_statistics ( '  Statistics for normalized data:', s )
#
#  Discard the data file.
#
  print ( "" )
  print ( "  Deleting dataset file:" )
  os.remove ( filename )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise2:" )
  print ( "  Normal end of execution." )

  return

def _column_statistics ( data ):

#*****************************************************************************80
#
## _column_statistics() computes six summary statistics for each column.
#
#  Input:
#
#    real data[rows,cols]: the numeric data, one feature per column.
#
#  Output:
#
#    real s[6,cols]: for each column, in row order: minimum, maximum,
#    range, mean, variance (population, as computed by np.var), and RMS.
#
  import numpy as np

  rows, cols = np.shape ( data )

  s = np.zeros ( [ 6, cols ] )
  s[0,:] = np.min ( data, axis = 0 )
  s[1,:] = np.max ( data, axis = 0 )
  s[2,:] = s[1,:] - s[0,:]
  s[3,:] = np.mean ( data, axis = 0 )
  s[4,:] = np.var ( data, axis = 0 )
#
#  RMS = sqrt ( sum ( x^2 ) / n ), where n is the number of entries in the
#  column, that is, the number of ROWS (not the number of columns).
#
  s[5,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

  return s

def _print_statistics ( title, s ):

#*****************************************************************************80
#
## _print_statistics() prints a labeled table of per-column statistics.
#
#  Input:
#
#    string title: the heading line printed above the table.
#
#    real s[6,cols]: the statistics, as returned by _column_statistics().
#
  labels = ( 'Minimum ', 'Maximum ', 'Range   ', 'Mean    ', 'Variance', 'RMS     ' )

  print ( '' )
  print ( title )
  print ( '' )
  for label, values in zip ( labels, s ):
    print ( '  %8s  ' % ( label ), end = '' )
    for value in values:
      print ( '  %8.2g' % ( value ), end = '' )
    print ( '' )

  return

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported as a module.
#
if __name__ == "__main__":
  exercise2()