#! /usr/bin/env python3
#
def india_test ( ):
  """Step interactively through a pandas demonstration on India state data.

  Reads 'india_data.csv' from the current working directory, using its first
  column as the index.  The file is expected to provide the columns
  'Male Population', 'Female Population', 'Area (km2)', 'Male Literacy (%)',
  'Female Literacy (%)' and 'Fertility Rate', and a row labeled 'West Bengal'.

  Each step prints the pandas statement being demonstrated, prints its
  result, and then waits for the user to press RETURN.  Two scatter plots
  are written to 'correlation_example.png' and 'correlation_fitted.png'
  and also displayed with plt.show().

  Returns:
    None.
  """
  import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
#
#  Column labels that are used more than once.
#
  area = 'Area (km2)'
  density = 'Population Density (km-2)'
  literacy = 'Female Literacy (%)'
  fertility = 'Fertility Rate'
#
#  Read "india_data.csv", containing columns of demographic data
#  on the 36 states and union territories (UTs) of India.
#
  print ( '' )
  print ( "  Read file into DataFrame:" )
  print ( "    df = pd.read_csv ( 'india_data.csv', index_col = 0 )" )

  df = pd.read_csv ( 'india_data.csv', index_col = 0 )
  input ( "RETURN" )
#
#  The DataFrame contains an Index of State/UT name.
#
  print ( '' )
  print ( '  The index is the State name' )
  print ( '    print ( df.index )' )

  print ( df.index )
  input ( "RETURN" )
#
#  The DataFrame contains labeled columns.
#
  print ( '' )
  print ( '  Each column has a label:' )
  print ( '    print ( df.columns )' )
  print ( df.columns )
  input ( "RETURN" )
#
#  df.head(n) outputs the first n rows (five rows if n is not specified).
#
  print ( '' )
  print ( '  df.head() prints an outline of the DataFrame:' )

  print ( df.head ( ) )
  input ( "RETURN" )
#
#  Compute a new column as the sum of two existing ones.
#
  print ( '' )
  print ( '  Add a new column, Population = Males + Females' )
  print ( "    df['Population'] = df['Male Population'] + df['Female Population']" )

  df['Population'] = df['Male Population'] + df['Female Population']
  total_pop = df['Population'].sum()
  print ( "Total population = df['Population'].sum():", total_pop )
  input ( "RETURN" )
#
#  Compute population density, then print population density of West Bengal.
#
  print ( '' )
  print ( '  Add population density:' )
  print ( "    df['Population Density (km-2)'] = df['Population'] / df['Area (km2)']" )

  df[density] = df['Population'] / df[area]
  print ( "  print ( df.loc['West Bengal', 'Population Density (km-2)'] )" )
  print ( df.loc['West Bengal', density] )
  input ( "RETURN" )
#
#  Compute the mean population density: total population over total area.
#
  print ( '' )
  print ( "  Mean population density: total_pop / df['Area (km2)'].sum()" )
  print ( "    print ( total_pop / df['Area (km2)'].sum() )" )
  print ( total_pop / df[area].sum() )
  input ( "RETURN" )
#
#  Maximum and minimum values are obtained in the same way as in NumPy.
#
  print ( '' )
  print ( "  Lowest male literacy rate?: df['Male Literacy (%)'].min()" )
  print ( "    print ( df['Male Literacy (%)'].min() )" )
  print ( df['Male Literacy (%)'].min() )
  input ( "RETURN" )
#
#  idxmin and idxmax return the index label of the minimum and
#  maximum values, respectively.
#
  print ( '' )
  print ( "  Largest state?: df['Area (km2)'].idxmax()" )
  print ( "    print ( df['Area (km2)'].idxmax() )" )
  print ( df[area].idxmax() )
  input ( "RETURN" )
#
#  The value returned by idxmax() can be passed to df.loc to obtain
#  the entire row, here for the most densely populated State / UT.
#  The label is looked up once and reused for both printouts.
#
  densest = df[density].idxmax()

  print ( '' )
  print ( "  Greatest population density?: " )
  print ( "    print ( df['Population Density (km-2)'].idxmax() )" )
  print ( densest )

  print ( '' )
  print ( '  Print all data for greatest population density state:' )
  print ( "    print ( df.loc[df['Population Density (km-2)'].idxmax()] )" )

  print ( df.loc[densest] )
  input ( "RETURN" )
#
#  corr() applied to two Series produces a single correlation coefficient.
#
  print ( '' )
  print ( '  Correlation between literacy and fertility:' )
  print ( "    print ( df['Female Literacy (%)'].corr( df['Fertility Rate'] ) )" )

  print ( df[literacy].corr ( df[fertility] ) )
  input ( "RETURN" )
#
#  df.plot.scatter draws on a new Matplotlib figure, which is then
#  saved to a file, displayed, and closed.
#
  print ( '' )
  print ( '  Scatterplot of literacy and fertility:' )
  print ( "    df.plot.scatter ( 'Female Literacy (%)', 'Fertility Rate' )" )

  df.plot.scatter ( literacy, fertility )
  filename = 'correlation_example.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "' + filename + '"' )
  plt.show ( )
  plt.close ( )
  input ( "RETURN" )
#
#  Compute linear fit.
#  One literacy value was NaN.  dropna() removes every row that
#  contains a NaN in any column.
#
  print ( '' )
  print ( '  Drop record with NaN:' )
  print ( '    df = df.dropna ( )' )

  df = df.dropna ( )

  print ( '' )
  print ( '  Compute linear fit: fertility = a * literacy + b:' )

  x = df[literacy].values
  y = df[fertility].values
  a, b = llsq ( x, y )

  print ( '  a = ', a, '  b = ', b )
  input ( "RETURN" )
#
#  Overlay the fitted line, drawn between the extreme x values, on a
#  fresh scatter plot of the cleaned data.
#
  xmin = np.min ( x )
  xmax = np.max ( x )

  df.plot.scatter ( literacy, fertility )
  plt.plot ( [xmin,xmax], [a*xmin+b, a*xmax+b], 'r-' )
  filename = 'correlation_fitted.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "' + filename + '"' )
  plt.show ( )
  plt.close ( )

def llsq ( x, y ):
  """Fit the straight line y = a * x + b to data by linear least squares.

  Parameters:
    x: non-empty sequence or array of abscissa values.
    y: sequence or array of ordinate values, with the same shape as x.

  Returns:
    (a, b): the slope and intercept of the fitted line.  If every x value
    is identical (which includes the case of a single data point), the
    slope is not determined by the data; the horizontal line through the
    mean of y is returned, that is, a = 0.0 and b = mean(y).

  Raises:
    ValueError: if x and y have different shapes, or contain no data.
  """
  import numpy as np

  x = np.asarray ( x, dtype = float )
  y = np.asarray ( y, dtype = float )

  if ( x.shape != y.shape ):
    raise ValueError ( 'llsq: x and y must have the same shape.' )

  if ( x.size == 0 ):
    raise ValueError ( 'llsq: at least one data point is required.' )

  xbar = np.mean ( x )
  ybar = np.mean ( y )
#
#  With no spread in x, the denominator below is zero and the quotient
#  would be NaN or infinite.  Comparing max and min detects this case
#  exactly, without depending on rounding in the computed mean.
#
  if ( x.max ( ) == x.min ( ) ):
    return 0.0, ybar
#
#  Since sum ( x - xbar ) = 0, these dot products equal the centered sums
#  sum ( ( x - xbar ) * ( y - ybar ) ) and sum ( ( x - xbar )^2 ).
#
  xy = np.dot ( x, y - ybar )
  xx = np.dot ( x, x - xbar )

  a = xy / xx
  b = ybar - a * xbar

  return a, b

# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
  india_test ( )