#! /usr/bin/env python3
#
#  Modified:
#
#    26 January 2022
#
def random_line():
    """
    Analyze random (x, y) data by fitting a least-squares line.

    Reads the file 'random_data.txt' (one "x y" pair per line), prints
    basic statistics, plots the raw data, then computes the coefficients
    of the least-squares line y = c[0] + c[1]*x by five methods:

      1. the normal equations,
      2. QR factorization,
      3. the SVD pseudoinverse,
      4. numpy.linalg.lstsq,
      5. scikit-learn's LinearRegression.

    The coefficient vectors and mean square errors are printed, and the
    data together with the fitted line is plotted.

    Side effects: writes 'random_line.png' and 'random_line_fit.png'
    and displays each plot interactively with plt.show().
    """
    import matplotlib.pyplot as plt
    import numpy as np

    print("random_line:")
    #
    #  Get the data.
    #
    filename = 'random_data.txt'
    data = np.loadtxt(filename)
    n, d = np.shape(data)
    print(" Data from " + filename + " involves n =", n,
          "items with dimension d =", d)
    #
    #  Analyze the data.
    #
    print("")
    print(" Data statistics:")
    print(" Min = ", np.min(data, axis=0))
    print(" Max = ", np.max(data, axis=0))
    print(" Range = ", np.max(data, axis=0) - np.min(data, axis=0))
    print(" Mean = ", np.mean(data, axis=0))
    print(" Variance = ", np.var(data, axis=0))
    #
    #  Display the data as (x,y).
    #
    plt.clf()
    plt.plot(data[:, 0], data[:, 1], 'bo')
    plt.grid(True)
    plt.xlabel('<--- x --->')
    plt.ylabel('<--- y=(3x+5+normal)/2 --->')
    plt.title('Random data', fontsize=16)
    filename = 'random_line.png'
    plt.savefig(filename)
    print('')
    print(' Graphics saved as "%s"' % (filename))
    plt.show()
    plt.clf()
    #
    #  Build the design matrix X: a leading column of 1's (for the
    #  intercept) followed by the independent variables.  The last
    #  column of the data is the dependent variable y.
    #
    x0 = np.ones([n, 1])
    X = np.c_[x0, data[:, 0:d - 1]]
    y = data[:, d - 1]

    print('')
    print(' Computed coefficient vector c:')
    #
    #  1: Normal equations: solve X'X c = X'y.
    #
    XTX = np.matmul(np.transpose(X), X)
    XTy = np.matmul(np.transpose(X), y)
    c1 = np.linalg.solve(XTX, XTy)
    print(' 1: Normal equations: ', c1)
    r1 = np.dot(X, c1) - y
    mse1 = np.sum(r1 ** 2) / n
    #
    #  2: QR factorization: X = QR, then solve the triangular system
    #  R c = Q'y.
    #
    Q, R = np.linalg.qr(X)
    QTy = np.dot(Q.T, y)
    c2 = np.linalg.solve(R, QTy)
    print(' 2: QR factors: ', c2)
    r2 = np.dot(X, c2) - y
    mse2 = np.sum(r2 ** 2) / n
    #
    #  3: SVD pseudoinverse: c = pinv(X) y.
    #
    P = np.linalg.pinv(X)
    c3 = np.dot(P, y)
    print(' 3: SVD pseudoinverse ', c3)
    r3 = np.dot(X, c3) - y
    mse3 = np.sum(r3 ** 2) / n
    #
    #  4: Numpy lstsq.
    #  lstsq returns 4 things and we only want the first one, hence the
    #  trailing [0].  The uglier alternative would be:
    #    c4, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    #
    c4 = np.linalg.lstsq(X, y, rcond=None)[0]
    print(' 4: numpy lstsq() ', c4)
    r4 = np.dot(X, c4) - y
    mse4 = np.sum(r4 ** 2) / n
    #
    #  5: scikit-learn.
    #
    from sklearn.linear_model import LinearRegression

    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    c5 = np.r_[lin_reg.intercept_, lin_reg.coef_[1:]]  # Observe underscores!
    print(' 5: sklearn ', c5)
    r5 = np.dot(X, c5) - y
    mse5 = np.sum(r5 ** 2) / n
    #
    #  Report errors.
    #
    print('')
    print(' MSE (mean square errors)')
    print(' 1: Normal equations: ', mse1)
    print(' 2: QR factors: ', mse2)
    print(' 3: SVD pseudoinverse ', mse3)
    print(' 4: numpy lstsq() ', mse4)
    print(' 5: sklearn ', mse5)
    #
    #  Display the data and fitting line (using the normal-equations
    #  coefficients c1; all five methods agree to rounding).
    #
    plt.clf()
    plt.plot(data[:, 0], data[:, 1], 'bo', markersize=8)
    m1 = np.min(data[:, 0])
    m2 = np.max(data[:, 0])
    p1 = c1[0] + c1[1] * m1
    p2 = c1[0] + c1[1] * m2
    plt.plot([m1, m2], [p1, p2], 'r-', linewidth=2)
    plt.grid(True)
    plt.xlabel('<--- x --->')
    plt.ylabel('<--- y=fitted line --->')
    plt.title('Least squares fit to random data', fontsize=16)
    filename = 'random_line_fit.png'
    plt.savefig(filename)
    print('')
    print(' Graphics saved as "%s"' % (filename))
    plt.show()
    plt.close()

    return


def random_data_generate():
    """
    Generate random samples of y = (3x + 5 + noise) / 2 and save them.

    n = 100 values of x are drawn uniformly from [-1, 3]; the noise is
    standard normal.  The (x, y) pairs are written one per line to the
    file 'random_data.txt'.
    """
    import numpy as np

    n = 100
    x = -1.0 + 4.0 * np.random.rand(n)
    y = 0.5 * (3.0 * x + 5.0 + np.random.normal(size=n))

    filename = 'random_data.txt'
    #
    #  A context manager guarantees the file is closed even on error.
    #
    with open(filename, 'w') as output:
        for i in range(0, n):
            output.write(' %8.4g %8.4g\n' % (x[i], y[i]))

    print(' Data saved as "%s"' % (filename))

    return


if __name__ == "__main__":
    import os
    #
    #  Generate the data file first if it is missing, so the analysis
    #  does not fail with FileNotFoundError on a fresh run.
    #
    if not os.path.exists('random_data.txt'):
        random_data_generate()
    random_line()