#! /usr/bin/env python3
#
def india_test():
    """Walk interactively through a pandas analysis of Indian demographic data.

    Reads 'india_data.csv' from the current directory (indexed by its first
    column), derives population and population-density columns, prints a
    series of summary statistics, and saves two scatter plots
    ('correlation_example.png' and 'correlation_fitted.png').  After each
    step the user is prompted with "RETURN" and must press Enter to continue.

    The CSV is expected to provide the columns used below: 'Male Population',
    'Female Population', 'Area (km2)', 'Male Literacy (%)',
    'Female Literacy (%)' and 'Fertility Rate'.
    """
    # Imports are kept local so that llsq() can be imported and used
    # without pandas or matplotlib being installed.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    #
    #  Read "india_data.csv", containing columns of demographic data
    #  on the 36 states and union territories (UTs) of India.
    #
    print('')
    print(" Read file into DataFrame:")
    print(" df = pd.read_csv ( 'india_data.csv', index_col = 0 )")
    df = pd.read_csv('india_data.csv', index_col=0)
    input("RETURN")
    #
    #  The DataFrame contains an Index of State/UT name.
    #
    print('')
    print(' The index is the State name')
    print(' print ( df.index )')
    print(df.index)
    input("RETURN")
    #
    #  The DataFrame contains columns labeled as follows:
    #
    print('')
    print(' Each column has a label:')
    print(' print ( df.columns )')
    print(df.columns)
    input("RETURN")
    #
    #  We can quickly inspect the DataFrame with df.head(n),
    #  which outputs the first n rows (or five rows if n is not specified):
    #
    print('')
    print(' df.head() prints an outline of the DataFrame:')
    print(df.head())
    input("RETURN")
    #
    #  pandas makes it straightforward to compute new columns for our
    #  DataFrame:
    #
    print('')
    print(' Add a new column, Population = Males + Females')
    print(" df['Population'] = df['Male Population'] + df['Female Population']")
    df['Population'] = df['Male Population'] + df['Female Population']
    total_pop = df['Population'].sum()
    print("Total population = df['Population'].sum():", total_pop)
    input("RETURN")
    #
    #  Compute population density, then print population density of
    #  West Bengal.
    #
    print('')
    print(' Add population density:')
    print(" df['Population Density (km-2)'] = df['Population'] / df['Area (km2)']")
    df['Population Density (km-2)'] = df['Population'] / df['Area (km2)']
    # The echoed command now carries its closing parenthesis so that it
    # matches the statement actually executed on the next line.
    print(" print ( df.loc['West Bengal', 'Population Density (km-2)'] )")
    print(df.loc['West Bengal', 'Population Density (km-2)'])
    input("RETURN")
    #
    #  Compute the mean population density.
    #
    print('')
    print(" Mean population density: total_pop / df['Area (km2)'].sum()")
    print(" print ( total_pop / df['Area (km2)'].sum() )")
    print(total_pop / df['Area (km2)'].sum())
    input("RETURN")
    #
    #  Maximum and minimum values are obtained in the same way as in NumPy,
    #  for example:
    #
    print('')
    print(" Lowest male literacy rate?: df['Male Literacy (%)'].min()")
    print(" print ( df['Male Literacy (%)'].min() )")
    print(df['Male Literacy (%)'].min())
    input("RETURN")
    #
    #  Perhaps more usefully, idxmin and idxmax return the index label(s)
    #  of the minimum and maximum values, respectively.
    #
    print('')
    print(" Largest state?: df['Area (km2)'].idxmax()")
    print(" print ( df['Area (km2)'].idxmax() )")
    print(df['Area (km2)'].idxmax())
    input("RETURN")
    #
    #  Naturally, the value returned by idxmax() can be passed to df.loc to
    #  obtain the entire row.  For example, the row corresponding to the most
    #  densely populated State / UT:
    #
    densest = df['Population Density (km-2)'].idxmax()
    print('')
    print(" Greatest population density?: ")
    print(" print ( df['Population Density (km-2)'].idxmax() )")
    print(densest)
    print('')
    print(' Print all data for greatest population density state:')
    print(" print ( df.loc[df['Population Density (km-2)'].idxmax()] )")
    print(df.loc[densest])
    input("RETURN")
    #
    #  Correlation statistics between DataFrames or Series can be
    #  calculated with the corr() function.
    #  In this case (two columns of data being compared), a single
    #  correlation coefficient is produced.  More generally, the correlation
    #  matrix is returned as a new DataFrame.
    #
    print('')
    print(' Correlation between literacy and fertility:')
    print(" print ( df['Female Literacy (%)'].corr( df['Fertility Rate'] ) )")
    print(df['Female Literacy (%)'].corr(df['Fertility Rate']))
    input("RETURN")
    #
    #  pandas can quickly produce simple, labeled plots and charts from a
    #  DataFrame with a family of df.plot methods.  By default, these use
    #  Matplotlib.
    #
    print('')
    print(' Scatterplot of literacy and fertility:')
    print(" df.plot.scatter ( 'Female Literacy (%)', 'Fertility Rate' )")
    df.plot.scatter('Female Literacy (%)', 'Fertility Rate')
    filename = 'correlation_example.png'
    plt.savefig(filename)
    print(' Graphics saved as "' + filename + '"')
    plt.show()
    plt.close()
    input("RETURN")
    #
    #  Compute linear fit.
    #  One literacy value was NaN.  Drop that record.
    #  NOTE(review): dropna() with no arguments removes every row holding a
    #  NaN in ANY column, not only the literacy column - confirm against the
    #  data file that this is the intended selection.
    #
    print('')
    print(' Drop record with NaN:')
    print(' df = df.dropna ( )')
    df = df.dropna()

    print('')
    print(' Compute linear fit: fertility = a * literacy + b:')
    x = df['Female Literacy (%)'].values
    y = df['Fertility Rate'].values
    a, b = llsq(x, y)
    print(' a = ', a, ' b = ', b)
    input("RETURN")
    #
    #  Overlay the fitted line, drawn between the extreme literacy values,
    #  on a fresh scatter plot.
    #
    xmin = np.min(x)
    xmax = np.max(x)
    df.plot.scatter('Female Literacy (%)', 'Fertility Rate')
    plt.plot([xmin, xmax], [a * xmin + b, a * xmax + b], 'r-')
    filename = 'correlation_fitted.png'
    plt.savefig(filename)
    print(' Graphics saved as "' + filename + '"')
    plt.show()
    plt.close()

    return


def llsq(x, y):
    """Fit the straight line y = a * x + b to data by linear least squares.

    Parameters
    ----------
    x, y : array_like
        One-dimensional sequences of equal length holding the abscissas and
        ordinates of the data points.  Plain Python lists are accepted.

    Returns
    -------
    a, b : float
        Slope and intercept of the least-squares line.  If every x value is
        identical the slope is not determined by the data; in that case the
        horizontal line through the mean, (0.0, mean(y)), is returned.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same shape.
    """
    import numpy as np

    # Coerce to float arrays so that list inputs support the vectorised
    # arithmetic below.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if x.shape != y.shape:
        raise ValueError("llsq: x and y must have the same shape")
    if x.size == 0:
        raise ValueError("llsq: at least one data point is required")

    xbar = np.mean(x)
    ybar = np.mean(y)

    # Degenerate data (a single point, or all x equal): the denominator of
    # the slope formula vanishes, so fall back to a horizontal line rather
    # than dividing by zero.
    if np.all(x == x[0]):
        return 0.0, ybar

    # sum(x * (y - ybar)) equals sum((x - xbar) * (y - ybar)) because the
    # deviations (y - ybar) sum to zero; likewise for the denominator.
    xy = np.dot(x, y - ybar)
    xx = np.dot(x, x - xbar)

    a = xy / xx
    b = ybar - a * xbar

    return a, b


if __name__ == "__main__":
    india_test()