#! /usr/bin/env python3 # def exercise4 ( ): #*****************************************************************************80 # ## exercise4 seeks the best two-part linear model for the Mexico data. # # Modified: # # 28 January 2022 # from sklearn.linear_model import LinearRegression import matplotlib.pyplot as plt import numpy as np print ( "exercise4:" ) print ( " Find best two-part linear model for Mexico population data." ) # # Get the data. # filename = 'mexico_population_data.txt' data = np.loadtxt ( filename ) n, d = np.shape ( data ) print ( " Data from " + filename + " involves n =", n, "items with dimension d =", d ) # # Analyze the data # print ( " Data statistics:" ) print ( " Min = ", np.min ( data, axis = 0 ) ) print ( " Max = ", np.max ( data, axis = 0 ) ) print ( " Range = ", np.max ( data, axis = 0 ) - np.min ( data, axis = 0 ) ) print ( " Mean = ", np.mean ( data, axis = 0 ) ) print ( " Variance = ", np.var ( data, axis = 0 ) ) # # Display the year and population. # plt.clf ( ) plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 ) plt.grid ( True ) plt.xlabel ( '<--- Year --->' ) plt.ylabel ( '<--- Population --->' ) plt.title ( 'Mexico Population Data', fontsize = 16 ) filename = 'exercise4_mexico_data.png' plt.savefig ( filename ) print ( ' Graphics saved as "%s"' % ( filename ) ) plt.show ( ) plt.clf ( ) # # Set up the linear system X * c = y # X = np.c_ [ np.ones ( n ), data[:,0] ] y = data[:,1] # # Solve for c using numpy linalg.lstsq() # c, _, _, _ = np.linalg.lstsq ( X, y, rcond = None ) r = np.dot ( X, c ) - y mse = np.sum ( r**2 ) / n print ( '' ) print ( ' MSE for a single linear model is ', mse ) # # Plot linear relationship y = c[0] * 1 + c[1] * x[0] # year_min = np.min ( data[:,0] ) year_max = np.max ( data[:,0] ) year = np.linspace ( year_min, year_max, 51 ) pop = c[0] + c[1] * year plt.clf ( ) plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 ) plt.plot ( year, pop, 'r-', markersize = 8, linewidth = 2 ) plt.grid ( True ) plt.xlabel ( '<--- Year --->' ) plt.ylabel ( '<--- Population --->' ) plt.title ( 'Mexico Population Data and Model', fontsize = 16 ) filename = 'exercise4_mexico_fitted.png' plt.savefig ( filename ) print ( ' Graphics saved as "%s"' % ( filename ) ) plt.show ( ) plt.clf ( ) # # From visual inspection of plot, a good break year is data[4,0] = 1940. # data1 = data[0:5,:] n1 = data1.shape[0] X1 = np.c_ [ np.ones ( n1 ), data1[:,0] ] y1 = data1[:,1] c1, _, _, _ = np.linalg.lstsq ( X1, y1, rcond = None ) r1 = np.dot ( X1, c1 ) - y1 mse1 = np.sum ( r1**2 ) / n1 data2 = data[4:,:] n2 = data2.shape[0] X2 = np.c_ [ np.ones ( n2 ), data2[:,0] ] y2 = data2[:,1] c2, _, _, _ = np.linalg.lstsq ( X2, y2, rcond = None ) r2 = np.dot ( X2, c2 ) - y2 mse2 = np.sum ( r2**2 ) / n2 print ( ' MSE1+MSE2 for a split linear model is ', mse1 + mse2 ) # # Plot linear relationship y = c[0] * 1 + c[1] * x[0] # year1_min = np.min ( data1[:,0] ) year1_max = np.max ( data1[:,0] ) year1 = np.linspace ( year1_min, year1_max, 2 ) pop1 = c1[0] + c1[1] * year1 year2_min = np.min ( data2[:,0] ) year2_max = np.max ( data2[:,0] ) year2 = np.linspace ( year2_min, year2_max, 2 ) pop2 = c2[0] + c2[1] * year2 plt.clf ( ) plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 ) plt.plot ( year1, pop1, 'r-', markersize = 8, linewidth = 2 ) plt.plot ( year2, pop2, 'g-', markersize = 8, linewidth = 2 ) plt.grid ( True ) plt.xlabel ( '<--- Year --->' ) plt.ylabel ( '<--- Population --->' ) plt.title ( 'Mexico Population Data and Two Linear Models', fontsize = 16 ) filename = 'exercise4_mexico_fitted2.png' plt.savefig ( filename ) print ( ' Graphics saved as "%s"' % ( filename ) ) plt.show ( ) plt.clf ( ) return if ( __name__ == "__main__" ): exercise4 ( )