#! /usr/bin/env python3
#
def exercise4 ( ):

#*****************************************************************************80
#
## exercise4 seeks the best two-part linear model for the Mexico data.
#
#  Modified:
#
#    28 January 2022
#
  from sklearn.linear_model import LinearRegression
  import matplotlib.pyplot as plt
  import numpy as np

  print ( "exercise4:" )
  print ( "  Find best two-part linear model for Mexico population data." )
#
#  Get the data.
#
  filename = 'mexico_population_data.txt'
  data = np.loadtxt ( filename )
  n, d = np.shape ( data )
  print ( "  Data from " + filename + " involves n =", n, "items with dimension d =", d )
#
#  Analyze the data
#
  print ( "  Data statistics:" )
  print ( "    Min      = ", np.min ( data, axis = 0 ) )
  print ( "    Max      = ", np.max ( data, axis = 0 ) )
  print ( "    Range    = ", np.max ( data, axis = 0 ) - np.min ( data, axis = 0 ) )
  print ( "    Mean     = ", np.mean ( data, axis = 0 ) )
  print ( "    Variance = ", np.var ( data, axis = 0 ) )
#
#  Display the year and population.
#
  plt.clf ( )
  plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 )
  plt.grid ( True )
  plt.xlabel ( '<--- Year --->' )
  plt.ylabel ( '<--- Population --->' )
  plt.title ( 'Mexico Population Data', fontsize = 16 )
  filename = 'exercise4_mexico_data.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "%s"' % ( filename ) )
  plt.show ( )
  plt.clf ( )
#
#  Set up the linear system X * c = y
#
  X = np.c_ [ np.ones ( n ), data[:,0] ]
  y = data[:,1]
#
#  Solve for c using numpy linalg.lstsq()
#
  c, _, _, _ = np.linalg.lstsq ( X, y, rcond = None )
  r = np.dot ( X, c ) - y
  mse = np.sum ( r**2 ) / n
  print ( '' )
  print ( '  MSE for a single linear model is ', mse )
#
#  Plot linear relationship y = c[0] * 1 + c[1] * x[0]
#
  year_min = np.min ( data[:,0] )
  year_max = np.max ( data[:,0] )
  year = np.linspace ( year_min, year_max, 51 )
  pop = c[0] + c[1] * year
  plt.clf ( )
  plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 )
  plt.plot ( year, pop, 'r-', markersize = 8, linewidth = 2 )
  plt.grid ( True )
  plt.xlabel ( '<--- Year --->' )
  plt.ylabel ( '<--- Population --->' )
  plt.title ( 'Mexico Population Data and Model', fontsize = 16 )
  filename = 'exercise4_mexico_fitted.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "%s"' % ( filename ) )
  plt.show ( )
  plt.clf ( )
#
#  From visual inspection of plot, a good break year is data[4,0] = 1940.
#
  data1 = data[0:5,:]
  n1 = data1.shape[0]
  X1 = np.c_ [ np.ones ( n1 ), data1[:,0] ]
  y1 = data1[:,1]
  c1, _, _, _ = np.linalg.lstsq ( X1, y1, rcond = None )
  r1 = np.dot ( X1, c1 ) - y1
  mse1 = np.sum ( r1**2 ) / n1

  data2 = data[4:,:]
  n2 = data2.shape[0]
  X2 = np.c_ [ np.ones ( n2 ), data2[:,0] ]
  y2 = data2[:,1]
  c2, _, _, _ = np.linalg.lstsq ( X2, y2, rcond = None )
  r2 = np.dot ( X2, c2 ) - y2
  mse2 = np.sum ( r2**2 ) / n2
  print ( '  MSE1+MSE2 for a split linear model is ', mse1 + mse2 )
#
#  Plot linear relationship y = c[0] * 1 + c[1] * x[0]
#
  year1_min = np.min ( data1[:,0] )
  year1_max = np.max ( data1[:,0] )
  year1 = np.linspace ( year1_min, year1_max, 2 )
  pop1 = c1[0] + c1[1] * year1

  year2_min = np.min ( data2[:,0] )
  year2_max = np.max ( data2[:,0] )
  year2 = np.linspace ( year2_min, year2_max, 2 )
  pop2 = c2[0] + c2[1] * year2

  plt.clf ( )
  plt.plot ( data[:,0], data[:,1], 'bo', markersize = 8 )
  plt.plot ( year1, pop1, 'r-', markersize = 8, linewidth = 2 )
  plt.plot ( year2, pop2, 'g-', markersize = 8, linewidth = 2 )
  plt.grid ( True )
  plt.xlabel ( '<--- Year --->' )
  plt.ylabel ( '<--- Population --->' )
  plt.title ( 'Mexico Population Data and Two Linear Models', fontsize = 16 )
  filename = 'exercise4_mexico_fitted2.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "%s"' % ( filename ) )
  plt.show ( )
  plt.clf ( )

  return

if ( __name__ == "__main__" ):
  exercise4 ( )