#! /usr/bin/env python3
#
def exercise4():

#*****************************************************************************80
#
## exercise4() processes the diabetes dataset, which has missing values.
#
#  Discussion:
#
#    The diabetes dataset is entirely numeric, but is stored as a CSV file.
#    A number of feature values are set to 0, indicating that the actual
#    value is missing.
#
#    The features are:
#      0. Number of times pregnant.
#      1. Plasma glucose concentration a 2 hours in an oral glucose tolerance test.
#      2. Diastolic blood pressure (mm Hg).
#      3. Triceps skinfold thickness (mm).
#      4. 2-Hour serum insulin (mu U/ml).
#      5. Body mass index (weight in kg/(height in m)^2).
#      6. Diabetes pedigree function.
#      7. Age (years).
#      8. Class variable (0 or 1).
#
#    The function carries out these steps:
#      * download 'diabetes.csv' unless a local copy already exists;
#      * read it into a pandas dataframe and print a summary and the headers;
#      * count the zero entries in features 1 through 5, turn them into NaN,
#        and drop every record that contains a NaN;
#      * copy the surviving records into a numpy array;
#      * save and display a histogram of feature 3 (Triceps);
#      * print, for every column, the minimum, maximum, range, mean,
#        variance (np.var with its default settings) and RMS value;
#      * delete the data file, whether or not it was downloaded by this call.
#
#    The RMS of a column x holding "rows" samples is
#      sqrt ( sum ( x(i)^2 ) / rows ),
#    so the Euclidean norm must be divided by sqrt ( rows ), the number of
#    records, and not by the number of columns.
#
#    Note: the opening banner announces a standardization step, but no
#    standardization is performed by this function.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license. 
#
#  Modified:
#
#    14 January 2022
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    None.
#
#  Output:
#
#    None.  Results are printed, and a plot is written to 'exercise4.jpg'.
#
  from os.path import exists
  import matplotlib.pyplot as plt
  import numpy as np
  import os
  import pandas as pd
  import wget

  print ( "exercise4():" )
  print ( "  Get a datafile in csv format from the class website." )
  print ( "  Read the data from the datafile into a pandas() dataframe." )
  print ( "  Print the first five lines." )
  print ( "  Copy the data from the dataframe into a numpy() array." )
  print ( "  Make a histogram of some of the features." )
  print ( "  Compute some statistical measurements." )
  print ( "  Standardize the data, and recompute the statistics." )
  print ( "  When finished, delete the datafile." )
#
#  Get a copy of the datafile from the website.
#
  filename = 'diabetes.csv'
  if ( exists ( filename ) ):
    print ( "  Already have a copy of " + filename )
  else:
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + filename
#
#  Keep the name returned by wget, since that is the file actually written.
#
    filename = wget.download ( url )
#
#  Read the data from the file into a "dataframe".
#
  df = pd.read_csv ( filename )

  print ( df.describe ( ) )
#
#  Print the headers.
#
  print ( "" )
  print ( "  Dataframe headers:" )
  print ( "" )
  for col in df.columns:
    print ( col )
#
#  Columns 1 through 5 should never be zero.
#  Name them once so that the count and the replacement cannot disagree.
#
  zero_is_missing = [ "Glucose", "Diastolic", "Triceps", "Insulin", "BMI" ]

  missing = ( df[zero_is_missing] == 0 ).sum ( )
  print ( "" )
  print ( "  Number of missing values in columns 1-5:" )
  print ( missing )
#
#  Replace 0 values with NaN
#
  df[zero_is_missing] = df[zero_is_missing].replace ( 0, np.nan )
#
#  Drop all records with any NaN value.
#
  df.dropna ( inplace = True )
#
#  Copy dataframe contents into a numpy array.
#  (We can only do this simple operation because the entire array is numeric.)
#
  data = df.to_numpy ( )

  rows, cols = np.shape ( data )
  print ( '' )
  print ( '  "' + filename + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Do a histogram of column 3 (Triceps).
#
  plt.clf ( )
  plt.hist ( data[:,3], bins = 20 )
  plt.grid ( True )
  plt.title ( 'diabetes data: Triceps' )
  filename2 = 'exercise4.jpg'
  plt.savefig ( filename2 )
  print ( "  Graphics saved as '" + filename2 + "'" )
  plt.show ( )
  plt.close ( )
#
#  Compute statistical data for each column.
#  Every reduction runs down the rows (axis = 0), giving one value per column.
#
  s = np.zeros ( [ 6, cols ] )

  s[0,:] = np.min ( data, axis = 0 )
  s[1,:] = np.max ( data, axis = 0 )
  s[2,:] = s[1,:] - s[0,:]
  s[3,:] = np.mean ( data, axis = 0 )
  s[4,:] = np.var ( data, axis = 0 )
#
#  RMS: divide the column norm by the square root of the number of samples.
#
  s[5,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

  labels = ( 'Minimum ', 'Maximum ', 'Range   ', 'Mean    ', 'Variance', 'RMS     ' )

  print ( '' )
  print ( '  Statistics for data:' )
  print ( '' )
  for i in range ( 0, 6 ):
    print ( '  %8s  ' % ( labels[i] ), end = '' )
    for j in range ( 0, cols ):
      print ( '  %8.2g' % ( s[i,j] ), end = '' )
    print ( '' )
#
#  Discard the data file.
#
  print ( "" )
  print ( "  Deleting dataset file:" )
  os.remove ( filename )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise4:" )
  print ( "  Normal end of execution." )

  return

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported as a module.
#
if __name__ == "__main__":
  exercise4()