#! /usr/bin/env python3
#
def exercise1():

#*****************************************************************************80
#
## exercise1() processes the homes dataset.
#
#  Discussion:
#
#    The data is stored as a text file, with spaces used as the delimiter.
#
#    If the datafile is not already in the current directory, it is
#    downloaded with wget.  The first (up to) five rows are printed,
#    followed by per-column statistics (minimum, maximum, range, mean,
#    variance, RMS) for the raw data and for the data after each column
#    has been min-max normalized.  The datafile is deleted before
#    returning, whether it was downloaded here or was already present.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license. 
#
#  Modified:
#
#    14 January 2022
#
#  Author:
#
#    John Burkardt
#
  from os.path import exists
  import numpy as np
  import os

  print ( "exercise1():" )
  print ( "  Get a datafile in text format from the class website." )
  print ( "  Read the data from the datafile." )
  print ( "  Print the first five lines." )
  print ( "  Compute some statistical measurements." )
  print ( "  Normalize the data, and recompute the statistics." )
  print ( "  When finished, delete the datafile." )
#
#  Get a copy of the datafile from the website.
#  wget is imported only here, so that it is not required when a local
#  copy of the file is already available.
#
  filename = 'homes_data.txt'
  if ( exists ( filename ) ):
    print ( "  Already have a copy of " + filename )
  else:
    import wget
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + filename
    filename = wget.download ( url )
#
#  Read the data from the file.
#  ndmin=2 guarantees a (rows,cols) array even if the file happens to
#  contain a single row or a single column.
#
  data = np.loadtxt ( filename, ndmin = 2 )
  rows, cols = np.shape ( data )

  print ( '' )
  print ( '  "' + filename + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Print the first five lines (or fewer, if the file is shorter).
#
  print ( '' )
  print ( '  First five lines of data:' )
  print ( '' )
  for i in range ( 0, min ( 5, rows ) ):
    print ( '  (%d)  ' % ( i ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( data[i,j] ), end = '' )
    print ( '' )
#
#  Compute statistical data for each column.
#
  s = _column_statistics ( data )
  _print_statistics ( '  Statistics for data:', s )
#
#  Normalize the data: subtract the column minimum, divide by the range.
#  A constant column has zero range; divide by 1 instead, so that it
#  becomes a column of zeros rather than NaN's.
#
  span = np.where ( s[2,:] > 0.0, s[2,:], 1.0 )
  data2 = ( data - s[0,:] ) / span
#
#  Recompute statistical data for each column.
#
  s = _column_statistics ( data2 )
  _print_statistics ( '  Statistics for normalized data:', s )
#
#  Discard the data file.
#
  print ( "" )
  print ( "  Deleting dataset file:" )
  os.remove ( filename )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise1():" )
  print ( "  Normal end of execution." )

  return

def _column_statistics ( data ):

#*****************************************************************************80
#
## _column_statistics() computes six statistics for each column of an array.
#
#  Input:
#
#    real data(rows,cols): the data, one variable per column.
#
#  Output:
#
#    real s(6,cols): for each column, in row order: the minimum, maximum,
#    range (maximum - minimum), mean, variance, and root-mean-square.
#
  import numpy as np

  rows, cols = np.shape ( data )

  s = np.zeros ( [ 6, cols ] )
  s[0,:] = np.min ( data, axis = 0 )
  s[1,:] = np.max ( data, axis = 0 )
  s[2,:] = s[1,:] - s[0,:]
  s[3,:] = np.mean ( data, axis = 0 )
  s[4,:] = np.var ( data, axis = 0 )
#
#  RMS = sqrt ( sum ( x^2 ) / n ), where n is the number of entries in
#  the column, that is, the number of ROWS.
#
  s[5,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

  return s

def _print_statistics ( title, s ):

#*****************************************************************************80
#
## _print_statistics() prints a labeled table of column statistics.
#
#  Input:
#
#    string title: a title line, printed between two blank lines.
#
#    real s(6,cols): the statistics, as returned by _column_statistics().
#
  labels = ( 'Minimum ', 'Maximum ', 'Range   ', 'Mean    ', 'Variance', 'RMS     ' )

  print ( '' )
  print ( title )
  print ( '' )
  for label, values in zip ( labels, s ):
    print ( '  %8s  ' % ( label ), end = '' )
    for value in values:
      print ( '  %8.2g' % ( value ), end = '' )
    print ( '' )

  return

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported as a module.
#
if __name__ == "__main__":
  exercise1()