#! /usr/bin/env python3
#
#  Modified:
#
#    26 January 2022
#
def random_line():
    """
    Analyze random (x, y) data by fitting a least-squares line.

    Reads the file 'random_data.txt' (one "x y" pair per line), prints
    basic statistics, plots the raw data, then computes the coefficients
    of the least-squares line y = c[0] + c[1]*x by five methods:

      1. the normal equations,
      2. QR factorization,
      3. the SVD pseudoinverse,
      4. numpy.linalg.lstsq,
      5. scikit-learn's LinearRegression.

    The coefficient vectors and mean square errors are printed, and the
    data together with the fitted line is plotted.

    Side effects: writes 'random_line.png' and 'random_line_fit.png'
    and displays each plot interactively with plt.show().
    """
    import matplotlib.pyplot as plt
    import numpy as np

    print("random_line:")
    #
    #  Get the data.
    #
    filename = 'random_data.txt'
    data = np.loadtxt(filename)
    n, d = np.shape(data)
    print(" Data from " + filename + " involves n =", n,
          "items with dimension d =", d)
    #
    #  Analyze the data.
    #
    print("")
    print(" Data statistics:")
    print(" Min = ", np.min(data, axis=0))
    print(" Max = ", np.max(data, axis=0))
    print(" Range = ", np.max(data, axis=0) - np.min(data, axis=0))
    print(" Mean = ", np.mean(data, axis=0))
    print(" Variance = ", np.var(data, axis=0))
    #
    #  Display the data as (x,y).
    #
    plt.clf()
    plt.plot(data[:, 0], data[:, 1], 'bo')
    plt.grid(True)
    plt.xlabel('<--- x --->')
    plt.ylabel('<--- y=(3x+5+normal)/2 --->')
    plt.title('Random data', fontsize=16)
    filename = 'random_line.png'
    plt.savefig(filename)
    print('')
    print(' Graphics saved as "%s"' % (filename))
    plt.show()
    plt.clf()
    #
    #  Build the design matrix X: a leading column of 1's (for the
    #  intercept) followed by the independent variables.  The last
    #  column of the data is the dependent variable y.
    #
    x0 = np.ones([n, 1])
    X = np.c_[x0, data[:, 0:d - 1]]
    y = data[:, d - 1]

    print('')
    print(' Computed coefficient vector c:')
    #
    #  1: Normal equations: solve X'X c = X'y.
    #
    XTX = np.matmul(np.transpose(X), X)
    XTy = np.matmul(np.transpose(X), y)
    c1 = np.linalg.solve(XTX, XTy)
    print(' 1: Normal equations: ', c1)
    r1 = np.dot(X, c1) - y
    mse1 = np.sum(r1 ** 2) / n
    #
    #  2: QR factorization: X = QR, then solve the triangular system
    #  R c = Q'y.
    #
    Q, R = np.linalg.qr(X)
    QTy = np.dot(Q.T, y)
    c2 = np.linalg.solve(R, QTy)
    print(' 2: QR factors: ', c2)
    r2 = np.dot(X, c2) - y
    mse2 = np.sum(r2 ** 2) / n
    #
    #  3: SVD pseudoinverse: c = pinv(X) y.
    #
    P = np.linalg.pinv(X)
    c3 = np.dot(P, y)
    print(' 3: SVD pseudoinverse ', c3)
    r3 = np.dot(X, c3) - y
    mse3 = np.sum(r3 ** 2) / n
    #
    #  4: Numpy lstsq.
    #  lstsq returns 4 things and we only want the first one, hence the
    #  trailing [0].  The uglier alternative would be:
    #    c4, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    #
    c4 = np.linalg.lstsq(X, y, rcond=None)[0]
    print(' 4: numpy lstsq() ', c4)
    r4 = np.dot(X, c4) - y
    mse4 = np.sum(r4 ** 2) / n
    #
    #  5: scikit-learn.
    #
    from sklearn.linear_model import LinearRegression

    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    c5 = np.r_[lin_reg.intercept_, lin_reg.coef_[1:]]  # Observe underscores!
    print(' 5: sklearn ', c5)
    r5 = np.dot(X, c5) - y
    mse5 = np.sum(r5 ** 2) / n
    #
    #  Report errors.
    #
    print('')
    print(' MSE (mean square errors)')
    print(' 1: Normal equations: ', mse1)
    print(' 2: QR factors: ', mse2)
    print(' 3: SVD pseudoinverse ', mse3)
    print(' 4: numpy lstsq() ', mse4)
    print(' 5: sklearn ', mse5)
    #
    #  Display the data and fitting line (using the normal-equations
    #  coefficients c1; all five methods agree to rounding).
    #
    plt.clf()
    plt.plot(data[:, 0], data[:, 1], 'bo', markersize=8)
    m1 = np.min(data[:, 0])
    m2 = np.max(data[:, 0])
    p1 = c1[0] + c1[1] * m1
    p2 = c1[0] + c1[1] * m2
    plt.plot([m1, m2], [p1, p2], 'r-', linewidth=2)
    plt.grid(True)
    plt.xlabel('<--- x --->')
    plt.ylabel('<--- y=fitted line --->')
    plt.title('Least squares fit to random data', fontsize=16)
    filename = 'random_line_fit.png'
    plt.savefig(filename)
    print('')
    print(' Graphics saved as "%s"' % (filename))
    plt.show()
    plt.close()

    return


def random_data_generate():
    """
    Generate random samples of y = (3x + 5 + noise) / 2 and save them.

    n = 100 values of x are drawn uniformly from [-1, 3]; the noise is
    standard normal.  The (x, y) pairs are written one per line to the
    file 'random_data.txt'.
    """
    import numpy as np

    n = 100
    x = -1.0 + 4.0 * np.random.rand(n)
    y = 0.5 * (3.0 * x + 5.0 + np.random.normal(size=n))

    filename = 'random_data.txt'
    #
    #  A context manager guarantees the file is closed even on error.
    #
    with open(filename, 'w') as output:
        for i in range(0, n):
            output.write(' %8.4g %8.4g\n' % (x[i], y[i]))

    print(' Data saved as "%s"' % (filename))

    return


if __name__ == "__main__":
    import os
    #
    #  Generate the data file first if it is missing, so the analysis
    #  does not fail with FileNotFoundError on a fresh run.
    #
    if not os.path.exists('random_data.txt'):
        random_data_generate()
    random_line()