#! /usr/bin/env python3
#
def exercise4():
    """Process the diabetes dataset, which has missing values.

    Discussion:

      The diabetes dataset is entirely numeric, but is stored as a CSV file.
      A number of feature values are set to 0, indicating that the actual
      value is missing.

      The features are:
      0. Number of times pregnant.
      1. Plasma glucose concentration at 2 hours in an oral glucose
         tolerance test.
      2. Diastolic blood pressure (mm Hg).
      3. Triceps skinfold thickness (mm).
      4. 2-Hour serum insulin (mu U/ml).
      5. Body mass index (weight in kg/(height in m)^2).
      6. Diabetes pedigree function.
      7. Age (years).
      8. Class variable (0 or 1).

      The function:
      * downloads 'diabetes.csv' unless a local copy already exists;
      * reads it into a pandas dataframe and prints a summary and the headers;
      * counts zeros in the Glucose, Diastolic, Triceps, Insulin and BMI
        columns, replaces them by NaN, and drops every record with a NaN;
      * copies the remaining records into a numpy array;
      * saves and shows a histogram of column 3 as 'exercise4.jpg';
      * prints minimum, maximum, range, mean, variance and RMS per column;
      * deletes the data file.

    Licensing:

      This code is distributed under the GNU LGPL license.

    Modified:

      14 January 2022

    Author:

      John Burkardt

    Returns:

      None.  All results are printed or written to 'exercise4.jpg'.
    """
    from os.path import exists
    import matplotlib.pyplot as plt
    import numpy as np
    import os
    import pandas as pd
    import wget

    # NOTE(review): the banner below also announces "Print the first five
    # lines" and "Standardize the data", which this function does not do
    # (it prints df.describe() and never standardizes).  The text is left
    # unchanged so that the program output stays the same.
    print ( "exercise4():" )
    print ( " Get a datafile in csv format from the class website." )
    print ( " Read the data from the datafile into a pandas() dataframe." )
    print ( " Print the first five lines." )
    print ( " Copy the data from the dataframe into a numpy() array." )
    print ( " Make a histogram of some of the features." )
    print ( " Compute some statistical measurements." )
    print ( " Standardize the data, and recompute the statistics." )
    print ( " When finished, delete the datafile." )
    #
    #  Get a copy of the datafile from the website.
    #
    filename = 'diabetes.csv'

    if exists ( filename ):
        print ( " Already have a copy of " + filename )
    else:
        print ( " Downloading dataset file:" )
        url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + filename
        # wget.download() returns the name of the file it actually wrote.
        filename = wget.download ( url )
    #
    #  Read the data from the file into a "dataframe".
    #
    df = pd.read_csv ( filename )

    print ( df.describe ( ) )
    #
    #  Print the headers.
    #
    print ( "" )
    print ( " Dataframe headers:" )
    print ( "" )
    for col in df.columns:
        print ( col )
    #
    #  Columns 1 through 5 should never be zero.
    #  A zero there marks a missing measurement.
    #
    never_zero = [ "Glucose", "Diastolic", "Triceps", "Insulin", "BMI" ]

    missing = ( df[never_zero] == 0 ).sum ( )
    print ( "" )
    print ( " Number of missing values in columns 1-5:" )
    print ( missing )
    #
    #  Replace 0 values with NaN.
    #
    df[never_zero] = df[never_zero].replace ( 0, np.nan )
    #
    #  Drop all records with any NaN value.
    #
    df.dropna ( inplace = True )
    #
    #  Copy dataframe contents into a numpy array.
    #  (We can only do this simple operation because the entire array is numeric.)
    #
    data = df.to_numpy ( )
    rows, cols = np.shape ( data )
    print ( '' )
    print ( ' "' + filename + '" contains', rows, 'rows and', cols, 'columns.' )
    #
    #  Do a histogram of column 3 (Triceps).
    #
    plt.clf ( )
    plt.hist ( data[:,3], bins = 20 )
    plt.grid ( True )
    plt.title ( 'diabetes data: Triceps' )
    filename2 = 'exercise4.jpg'
    plt.savefig ( filename2 )
    print ( " Graphics saved as '" + filename2 + "'" )
    plt.show ( )
    plt.close ( )
    #
    #  Compute statistical data for each column.
    #  Every reduction runs down axis 0, giving one value per column.
    #
    s = np.zeros ( [ 6, cols ] )
    s[0,:] = np.min ( data, axis = 0 )
    s[1,:] = np.max ( data, axis = 0 )
    s[2,:] = s[1,:] - s[0,:]
    s[3,:] = np.mean ( data, axis = 0 )
    s[4,:] = np.var ( data, axis = 0 )
    #
    #  RMS = sqrt ( sum ( x^2 ) / n ), where n is the number of samples in
    #  the column, that is, the number of ROWS (not the number of columns).
    #
    s[5,:] = np.linalg.norm ( data, axis = 0 ) / np.sqrt ( rows )

    labels = ( 'Minimum ', 'Maximum ', 'Range ', 'Mean ', 'Variance', 'RMS ' )

    print ( '' )
    print ( ' Statistics for data:' )
    print ( '' )
    for i in range ( 0, 6 ):
        print ( ' %8s ' % ( labels[i] ), end = '' )
        for j in range ( 0, cols ):
            print ( ' %8.2g' % ( s[i,j] ), end = '' )
        print ( '' )
    #
    #  Discard the data file.
    #
    print ( "" )
    print ( " Deleting dataset file:" )
    os.remove ( filename )
    #
    #  Terminate.
    #
    print ( "" )
    print ( "exercise4:" )
    print ( " Normal end of execution." )

    return

if ( __name__ == "__main__" ):
    exercise4 ( )