#! /usr/bin/env python3
#
def exercise1 ( ):

#*****************************************************************************80
#
## exercise1 seeks the best linear model for two temperature items.
#
#  Modified:
#
#    27 January 2022
#
  from sklearn.linear_model import LinearRegression
  import matplotlib.pyplot as plt
  import numpy as np

  print ( "exercise1:" )
  print ( "  Find best linear model for two items of temperature data." )
#
#  Get the data.
#
  filename = 'two_temperatures_data.txt'
  data = np.loadtxt ( filename )
  n, d = np.shape ( data )
  print ( "  Data from " + filename + " involves n =", n, "items with dimension d =", d )
#
#  Analyze the data
#
  print ( "  Data statistics:" )
  print ( "    Min      = ", np.min ( data, axis = 0 ) )
  print ( "    Max      = ", np.max ( data, axis = 0 ) )
  print ( "    Range    = ", np.max ( data, axis = 0 ) - np.min ( data, axis = 0 ) )
  print ( "    Mean     = ", np.mean ( data, axis = 0 ) )
  print ( "    Variance = ", np.var ( data, axis = 0 ) )
#
#  Display F versus C.
#
  plt.clf ( )
  plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 )
  plt.grid ( True )
  plt.xlabel ( '<--- Fahrenheit --->' )
  plt.ylabel ( '<--- Celsius --->' )
  plt.title ( 'Two temperatures, Fahrenheit/Celsius', fontsize = 16 )
  filename = 'exercise1_two_temperatures_data.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "%s"' % ( filename ) )
  plt.show ( )
  plt.clf ( )
#
#  Set up the linear system X * c = y
#
  X = np.c_ [ np.ones ( n ), data[:,0] ]
  y = data[:,1]
#
#  1) Set up and solve the normal equations.
#
  XTX = np.dot ( X.T, X )
  XTb = np.dot ( X.T, y )
  c1 = np.linalg.solve ( XTX, XTb )
  r1 = np.dot ( X, c1 ) - y
  mse1 = np.sum ( r1**2 ) / n
#
#  2) Use QR factorization linalg.qr().
#
  Q, R = np.linalg.qr ( X )
  QTb = np.dot ( Q.T, y )
  c2 = np.linalg.solve ( R, QTb )
  r2 = np.dot ( X, c2 ) - y
  mse2 = np.sum ( r2**2 ) / n
#
#  3) Use SVD pseudoinverse pinv().
#
  P = np.linalg.pinv ( X )
  c3 = np.dot ( P, y )
  r3 = np.dot ( X, c3 ) - y
  mse3 = np.sum ( r3**2 ) / n
#
#  4) Use numpy linalg.lstsq()
#
  c4, _, _, _ = np.linalg.lstsq ( X, y, rcond = None )
  r4 = np.dot ( X, c4 ) - y
  mse4 = np.sum ( r4**2 ) / n
#
#  5) Use scikit-learn LinearRegression()
#
  lin_reg = LinearRegression()
  lin_reg.fit ( X, y )
  c5 = np.r_ [ lin_reg.intercept_, lin_reg.coef_[1:] ]
  r5 = np.dot ( X, c5 ) - y
  mse5 = np.sum ( r5**2 ) / n
#
#  Tabulate results.
#
  print ( '' )
  print ( '  Method  ----------------c-------------     mse(r)' )
  print ( '' );
  print ( '  Normal    ', c1, '  ', mse1 )
  print ( '  QR        ', c2, '  ', mse2 )
  print ( '  PINV      ', c3, '  ', mse3 )
  print ( '  LSTSQ     ', c4, '  ', mse4 );
  print ( '  sklearn   ', c5, '  ', mse5 )
#
#  Plot linear relationship y = c[0] * 1 + c[1] * x[0]
#
  fahr_min = np.min ( data[:,0] )
  fahr_max = np.max ( data[:,0] )
  fahr = np.linspace ( fahr_min, fahr_max, 51 )
  celsius = ( c1[0] + c1[1] * fahr )
  plt.clf ( )
  plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 )
  plt.plot ( fahr, celsius, 'r-', markersize = 8, linewidth = 2 )
  plt.grid ( True )
  plt.xlabel ( '<--- Fahrenheit --->' )
  plt.ylabel ( '<--- Celsius --->' )
  plt.title ( 'Two Temperature Data and Model', fontsize = 16 )
  filename = 'exercise1_two_temperatures_fitted.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "%s"' % ( filename ) )
  plt.show ( )
  plt.clf ( )
#
#  Rescale c and coefficient of y(which was -1) by 160/c[0]. 
#
  print ( '' )
  print ( '  Rescaled coefficient vectors C[0], C[1], C[2]' );

  for c in [ c1, c2, c3, c4, c5 ]:
    s = 160 / c[0]
    cy = -1.0 * s
    c =     c * s
    print ( c, cy )

  return

if ( __name__ == "__main__" ):
  exercise1 ( )