#! /usr/bin/env python3
#
def basketball_test():
    """Demonstrate basic pandas operations on a basketball dataset.

    Reads 'basketball_data.csv' from the current directory, using column 1
    as the row index, then prints, filters, extends, and summarizes the
    data, saves several plots as PNG files, and finally writes the modified
    table to 'basketball2_data.csv'.

    The code references the columns 'Height', 'Weight', 'Age', 'Position'
    and 'SponsorshipEarnings', and the row labels 'Wade', 'Pete' and 'Dale',
    so the input file must provide them.

    Side effects: prints to stdout, shows matplotlib windows, and writes
    the files age_hist.png, height_weight_scatter.png,
    age_earnings_scatter.png, height_weight_linear.png,
    age_earnings_linear.png and basketball2_data.csv.
    """
    import pandas as pd

    print('')
    print('basketball_test():')
    print(' Read basketball data from csv file.')
    #
    #  Read the basketball dataset.
    #  Use column 1 (Player name) for the row index.
    #
    filename = 'basketball_data.csv'
    print('')
    print(' Read the data from "' + filename + '"')
    data = pd.read_csv(filename, header=0, index_col=1)
    #
    #  Print a small part of the data.
    #
    print('')
    print(' Print a small part of the data:')
    print(' print ( data.head() )')
    print(data.head())
    #
    #  Try to print the data.
    #
    print('')
    print(' Try to print the whole DataFrame:')
    print(' print ( data )')
    print(data)
    #
    #  Try again to print the data; to_string() renders the full table.
    #
    print('')
    print(' Try again to print the whole DataFrame:')
    print(' print ( data.to_string() )')
    print(data.to_string())
    #
    #  List column labels.
    #
    print('')
    print(' print data.columns')
    print(data.columns)
    #
    #  List player names.
    #
    print('')
    print(' print data.index')
    print(data.index)
    #
    #  List player 'Wade'.
    #
    print('')
    print(' List player Wade')
    print(data.loc['Wade'])
    #
    #  List player heights.
    #
    print('')
    print(' List heights')
    print(data.loc[:, 'Height'])
    #
    #  List weight of player Pete.
    #
    print('')
    print(' List weight of player Pete')
    print(data.loc['Pete', 'Weight'])
    #
    #  List players with 200 < Height (strict inequality).
    #
    print('')
    print(' List players who are tall:')
    print(data.loc[200 < data['Height']])
    #
    #  Add a row.
    #  NOTE(review): the 8 values must line up with the CSV's non-index
    #  columns, which are not visible here -- confirm against the file.
    #
    print('')
    print(' Add a new row, Zorg:')
    data.loc['Zorg'] = [27, 'center', 190, 210, 0, 'no', 'rookie', 24]
    print(data)
    #
    #  Add a column.
    #  The list holds 27 values, so the table must have exactly 27 rows at
    #  this point (including Zorg) or pandas rejects the assignment.
    #
    print('')
    print(' Add a new column, PointAverage:')
    data['PointAverage'] = [
        0.2, 5.0, 6.3, 5.5, 4.6,
        3.3, 4.1, 0.8, 0.8, 5.3,
        3.8, 1.8, 3.2, 4.1, 1.6,
        7.6, 1.5, 0.3, 2.1, 2.9,
        4.3, 2.2, 2.5, 1.0, 4.8,
        1.5, 3.6,
    ]
    #
    #  Average and sum point averages.
    #
    point_average = data.loc[:, 'PointAverage'].mean()
    print('')
    print(' Compute average and sum of point averages:')
    print(' Average personal points per game = ', point_average)
    point_sum = data.loc[:, 'PointAverage'].sum()
    print(' Average total points per game = ', point_sum)
    #
    #  Averages per position.
    #
    print('')
    print(' Compute average points per position:')
    for position in ('forward', 'center', 'guard'):
        players = data.loc[data['Position'] == position]
        position_mean = players.loc[:, 'PointAverage'].mean()
        print(' Average ' + position + ' points per game = ', position_mean)
    #
    #  Modify a value.
    #
    print('')
    print(' Correct height of Dale:')
    print(' Before:', data.loc['Dale', 'Height'])
    data.loc['Dale', 'Height'] = 203
    print(' After: ', data.loc['Dale', 'Height'])
    #
    #  Min, Mean, Max, std.
    #
    print('')
    print(' Print minimum of numeric data')
    print(data.min(numeric_only=True))

    print('')
    print(' Print mean of numeric data')
    print(data.mean(numeric_only=True))

    print('')
    print(' Print maximum of numeric data')
    print(data.max(numeric_only=True))

    print('')
    print(' Print std of numeric data')
    print(data.std(numeric_only=True))
    #
    #  How are data items correlated?
    #
    print('')
    print(' Compute all correlations:')
    print(' data.corr ( numeric_only = True )')
    print(data.corr(numeric_only=True))

    print('')
    print(' Compute all correlations (Try harder!):')
    print(' data.corr ( numeric_only = True ).to_string()')
    print(data.corr(numeric_only=True).to_string())
    #
    #  Histogram Ages.
    #
    print('')
    print(' Histogram player age')
    data.hist(column='Age', bins=5, rwidth=0.95)
    _finish_figure('Player ages', 'age_hist.png')
    #
    #  Scatterplot Height vs Weight.
    #
    print('')
    print(' Plot player height versus weight')
    data.plot.scatter('Height', 'Weight')
    _finish_figure('Tall players weigh more!', 'height_weight_scatter.png')
    #
    #  Scatterplot Age versus Sponsorship.
    #
    print('')
    print(' Plot player age versus sponsorship earnings')
    data.plot.scatter('Age', 'SponsorshipEarnings')
    _finish_figure('Young players earn more!', 'age_earnings_scatter.png')
    #
    #  Linear regression for height and weight.
    #
    print('')
    print(' Estimate linear model')
    print(' weight = slope * height + intercept')
    _fit_and_plot_line(
        data, 'Height', 'Weight',
        'Tall players weigh more!', 'height_weight_linear.png',
    )
    #
    #  Linear regression model for age and sponsorship.
    #
    print('')
    print(' Estimate linear model')
    print(' earnings = slope * age + intercept')
    _fit_and_plot_line(
        data, 'Age', 'SponsorshipEarnings',
        'Young players earn more!', 'age_earnings_linear.png',
    )
    #
    #  Save modified data to a new csv file.
    #
    print('')
    print(' Save revised data to a new csv file.')
    filename = 'basketball2_data.csv'
    data.to_csv(filename)
    print(' Data stored as "' + filename + '"')

    return


def _finish_figure(title, filename):
    """Add grid and title to the current figure, save it, show it, close it."""
    import matplotlib.pyplot as plt

    plt.grid(True)
    plt.title(title)
    # Save before show(): the figure is closed afterwards.
    plt.savefig(filename)
    print(' Graphics saved as "' + filename + '"')
    plt.show()
    plt.close()


def _fit_and_plot_line(data, x_name, y_name, title, filename):
    """Fit y = a * x + b to two columns, report it, and plot it over a scatter."""
    import matplotlib.pyplot as plt
    import numpy as np

    x = data[x_name]
    y = data[y_name]

    a, b = llsq(x, y)

    print('computed linear model')
    print(' model slope = ', a)
    print(' model intercept = ', b)

    data.plot.scatter(x_name, y_name)
    # A straight line is fully determined by its two end points.
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = a * xmin + b
    ymax = a * xmax + b
    plt.plot([xmin, xmax], [ymin, ymax], 'r-', linewidth=3)
    _finish_figure(title, filename)


def llsq(x, y):
    """Find the best linear least-squares fit y = a * x + b.

    Parameters
    ----------
    x, y : one-dimensional sequences of numbers with the same length
        (lists, numpy arrays, or pandas Series).

    Returns
    -------
    a, b : the slope and intercept of the fitted line.

    Raises
    ------
    ValueError
        If x and y are not one-dimensional with matching length, if they
        are empty, or if all x values are identical (the slope is then
        undefined).
    """
    import numpy as np

    xv = np.asarray(x, dtype=float)
    yv = np.asarray(y, dtype=float)

    if xv.ndim != 1 or xv.shape != yv.shape:
        raise ValueError('llsq(): x and y must be 1-D with the same length.')
    if xv.size == 0:
        raise ValueError('llsq(): x and y must not be empty.')
    # Compare the values directly rather than testing the denominator for
    # zero: rounding in the mean can leave a tiny nonzero denominator and
    # produce a meaningless slope.
    if np.all(xv == xv[0]):
        raise ValueError('llsq(): all x values are identical; slope undefined.')

    xbar = np.mean(xv)
    ybar = np.mean(yv)
    # Centered sums: algebraically the same as dot(x, y - ybar) and
    # dot(x, x - xbar), but less prone to cancellation error.
    dx = xv - xbar
    xy = np.dot(dx, yv - ybar)
    xx = np.dot(dx, dx)

    a = xy / xx
    b = ybar - a * xbar

    return a, b


if __name__ == "__main__":
    basketball_test()