#! /usr/bin/env python3
#
def exercise1():
  """Download, summarize, standardize, and plot the hw_data dataset.

  Discussion:

    The data is stored as a text file, with spaces used as the delimiter.
    The first column is an index and is discarded; the next two columns
    are labeled Height and Weight in the plot.  Only the first 350 rows
    (or fewer, if the file is shorter) are processed.

    The function:
      * uses 'hw_data.txt' from the current directory, downloading it
        from the class website if it is not already there;
      * prints the first five rows and a table of per-column statistics
        (minimum, maximum, range, mean, variance, standard deviation,
        root-mean-square);
      * standardizes each column to zero mean and unit standard deviation;
      * saves a scatter plot of the standardized data, overlaid with
        circles of radius 1, 2 and 3, to 'exercise1.jpg' and displays it;
      * prints the covariance matrix of the standardized data;
      * deletes the datafile, whether or not it was downloaded by this call.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    17 January 2022

  Author:

    John Burkardt

  Returns:

    None.  All results are printed or written to 'exercise1.jpg'.
  """
  from os.path import exists
  import matplotlib.pyplot as plt
  import numpy as np
  import os

  print ( "exercise1():" )
  print ( "  Get a datafile in text format from the class website." )
  print ( "  Read the data from the datafile." )
  print ( "  Print the first five lines." )
  print ( "  Compute some statistical measurements." )
  print ( "  Normalize the data, and recompute the statistics." )
  print ( "  When finished, delete the datafile." )
#
#  Get a copy of the datafile from the website.
#
  datafile = 'hw_data.txt'
  if ( exists ( datafile ) ):
    print ( "  Already have a copy of " + datafile )
  else:
#
#  wget is only needed for the download, so import it here; a run that
#  finds a local copy of the file then works without wget installed.
#
    import wget
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + datafile
    datafile = wget.download ( url )
#
#  Read the data from the file.
#
  data = np.loadtxt ( datafile )
  rows, cols = np.shape ( data )
  print ( '' )
  print ( '  "' + datafile + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Remove the first column, which is just an index, and adjust the number of columns.
#
  data = data[:,1:3]
  cols = 2
#
#  The full file is large.  Let's just do the first segment.
#  Clamp to the rows actually available, so that a shorter file does not
#  leave "rows" out of step with the shape of "data".
#
  rows = min ( 350, rows )
  data = data[0:rows,:]
#
#  Print the first five lines (fewer if the data is shorter than that).
#
  print ( '' )
  print ( '  First five lines of data:' )
  print ( '' )
  for i in range ( 0, min ( 5, rows ) ):
    print ( '  (%d)  ' % ( i ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( data[i,j] ), end = '' )
    print ( '' )
#
#  Compute statistical data, one row of S per statistic, one column per
#  data column.  Reductions along axis 0 handle all columns at once.
#
  s = np.zeros ( [ 7, cols ] )
  s[0,:] = np.min ( data, axis = 0 )
  s[1,:] = np.max ( data, axis = 0 )
  s[2,:] = s[1,:] - s[0,:]
  s[3,:] = np.mean ( data, axis = 0 )
  s[4,:] = np.var ( data, axis = 0 )
  s[5,:] = np.sqrt ( s[4,:] )
#
#  RMS = sqrt ( sum ( x^2 ) / n ) = ||x|| / sqrt ( n ), where n is the
#  number of samples (rows), not the number of columns.
#
  s[6,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

  labels = ( 'Minimum ', 'Maximum ', 'Range   ', 'Mean    ', 'Variance',
    'STD     ', 'RMS     ' )

  print ( '' )
  print ( '  Statistics for data:' )
  print ( '' )
  for label, values in zip ( labels, s ):
    print ( '  %8s  ' % ( label ), end = '' )
    for value in values:
      print ( '  %8.2g' % ( value ), end = '' )
    print ( '' )
#
#  Standardize the data: subtract the column mean, divide by the column
#  standard deviation.  Broadcasting applies this to every row.
#
  data2 = ( data - s[3,:] ) / s[5,:]
#
#  Create a scatter plot, with circles of radius 1, 2 and 3 for reference.
#
  plt.scatter ( data2[:,0], data2[:,1] )
  tc = np.linspace ( 0, 2.0 * np.pi, 51 )
  xc = np.cos ( tc )
  yc = np.sin ( tc )
  for radius in ( 1.0, 2.0, 3.0 ):
    plt.plot ( radius * xc, radius * yc, 'r-', linewidth = 2 )
  plt.xlabel ( 'Height' )
  plt.ylabel ( 'Weight' )
  plt.title ( '(Height,Weight) standardized' )
  plt.grid ( True )
  plt.axis ( 'equal' )
  plotfile = 'exercise1.jpg'
  plt.savefig ( plotfile )
  print ( '  Graphics saved as "' + plotfile + '"' )
  plt.show ( )
  plt.close ( )
#
#  Compute the covariance matrix.
#  Note that np.cov() divides by (rows-1) by default, while the
#  standardization above used the population variance (divide by rows),
#  so the diagonal entries are rows/(rows-1) rather than exactly 1.
#
  cov = np.cov ( data2, rowvar = False )
  print ( '' )
  print ( '  Covariance matrix for data2:' )
  print ( '' )
  print ( cov )
#
#  Discard the data file.
#
  print ( "" )
  print ( "  Deleting dataset file:" )
  os.remove ( datafile )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise1():" )
  print ( "  Normal end of execution." )

  return

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported as a module.
#
if __name__ == "__main__":
  exercise1()