#! /usr/bin/env python3
#
def exercise4():

#*****************************************************************************80
#
## exercise4() uses kmeans to cluster data, and then assign new data to clusters.
#
#  Discussion:
#
#    Three text files are read: blobs_data.txt (the data points),
#    blobs_clusters.txt (the blob index of each point) and blobs_centers.txt
#    (the blob centers).  A file that is not found in the current directory
#    is downloaded first.
#
#    The data is stored as a text file, with spaces used as the delimiter.
#
#    The first 2000 points are used to fit the clusters; the next 20 points
#    are held back, assigned to clusters afterwards, and compared with the
#    blob centers given in the files.
#
#    Two plots are saved: exercise4_data.jpg and exercise4_inertia.jpg.
#
#    KMeans is called without a random_state, so the inertia values and the
#    cluster numbering are not expected to be identical from run to run.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license. 
#
#  Modified:
#
#    20 January 2022
#
#  Author:
#
#    John Burkardt
#
#  Raises:
#
#    ValueError, if the data or label file has fewer than 2020 rows.
#
  from sklearn.cluster import KMeans
  import matplotlib.pyplot as plt
  import numpy as np

  print ( "exercise4():" )
  print ( "  Retrieve blobs data, clusters, centers." )
  print ( "  Work with 2000 data values, save 20 for later." )
  print ( "  Use inertia to choose k." )
  print ( "  Cluster the data." )
  print ( "  Now predict cluster assignments for last 20." )
  print ( "  Compare predictions to correct values." )
#
#  Get blobs_data.txt
#  ndmin keeps the array 2D even if the file holds a single row.
#
  datafile = _dataset_fetch ( 'blobs_data.txt' )
  X = np.loadtxt ( datafile, ndmin = 2 )
  rows, cols = np.shape ( X )
  print ( '' )
  print ( '  "' + datafile + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Get blobs_clusters.txt
#  The labels are used below as row indices into C, so they must be integers.
#
  datafile = _dataset_fetch ( 'blobs_clusters.txt' )
  y = np.loadtxt ( datafile, ndmin = 1 )
  y = y.astype ( int )
  rows = y.shape[0]
  print ( '' )
  print ( '  "' + datafile + '" contains', rows, 'rows.' )
#
#  Get blobs_centers.txt
#
  datafile = _dataset_fetch ( 'blobs_centers.txt' )
  C = np.loadtxt ( datafile, ndmin = 2 )
  rows, cols = np.shape ( C )
  print ( '' )
  print ( '  "' + datafile + '" contains', rows, 'rows and', cols, 'columns.' )
#
#  Split data into training and test sets.
#  Fail early, with a clear message, if the files are too short to supply
#  both sets.
#
  n1 = 2000
  n2 = 20

  if ( X.shape[0] < n1 + n2 or y.shape[0] < n1 + n2 ):
    raise ValueError (
      'exercise4(): need at least %d data rows and labels, found %d and %d.'
      % ( n1 + n2, X.shape[0], y.shape[0] ) )

  X1 = X[0:n1,:]
  y1 = y[0:n1]

  X2 = X[n1:n1+n2,:]
  y2 = y[n1:n1+n2]
#
#  Plot the training data, colored by blob index.
#
  plt.scatter ( X1[:,0], X1[:,1], c = y1, s = 1 )
  plt.xlabel ( "$x_1$" )
  plt.ylabel ( "$x_2$" )
  plotfile = 'exercise4_data.jpg'
  plt.savefig ( plotfile )
  print ( '  Graphics saved as "' + plotfile + '"' )
  plt.show ( )
  plt.close ( )
#
#  Apply kmeans to the data, and report the inertia.
#  Only the inertia is needed here, so the predicted labels are not kept.
#
  print ( '' )
  print ( '  k  Inertia' )
  print ( '' )
  kmax = 10
  inertia = np.zeros ( kmax )
  for k in range ( 1, kmax + 1 ):
    kmeans = KMeans ( n_clusters = k )
    kmeans.fit ( X1 )
    inertia[k-1] = kmeans.inertia_
    print ( '  %d  %g' % ( k, inertia[k-1] ) )
#
#  Plot the inertia.
#
  plt.plot ( np.arange ( 1, kmax + 1 ), inertia, 'bo-', linewidth = 3 )
  plt.grid ( True )
  plt.xlabel ( 'K: Number of clusters' )
  plt.ylabel ( 'Inertia(K)' )
  plt.title ( 'Blob inertia with increasing number of clusters' )
  plotfile = 'exercise4_inertia.jpg'
  plt.savefig ( plotfile )
  print ( '  Graphics saved as "' + plotfile + '"' )
  plt.show ( )
  plt.close ( )
#
#  Choose to use 5 clusters.
#
  k = 5
  kmeans = KMeans ( n_clusters = k )
  kmeans.fit ( X1 )
#
#  Predict clusters for test data.
#
  y2_pred = kmeans.predict ( X2 )
#
#  For each test data value, compare blob center to cluster center.
#  The kmeans cluster numbers need not agree with the blob numbers in the
#  file, so center coordinates are printed side by side rather than labels.
#
  print ( "" )
  print ( "  Test data: blob center and cluster center:" )
  print ( "" )
  for i, ( blob, cluster ) in enumerate ( zip ( y2, y2_pred ) ):
    print ( i, C[blob,:], kmeans.cluster_centers_[cluster,:] )
#
#  Terminate.
#
  print ( "" )
  print ( "exercise4():" )
  print ( "  Normal end of execution." )

  return

def _dataset_fetch ( datafile ):

#*****************************************************************************80
#
## _dataset_fetch() makes sure a dataset file is available locally.
#
#  Discussion:
#
#    If a file of the given name already exists, nothing is downloaded.
#    Otherwise the file is downloaded with wget from the class dataset
#    directory.  wget is only imported when a download is actually needed.
#
#  Input:
#
#    string datafile, the name of the dataset file.
#
#  Output:
#
#    string datafile, the name of the local file to read.  After a download,
#    this is the name returned by wget.download().
#
  from os.path import exists

  if ( exists ( datafile ) ):
    print ( "  Already have a copy of " + datafile )
  else:
    import wget
    print ( "  Downloading dataset file:" )
    url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + datafile
    datafile = wget.download ( url )

  return datafile

#
#  Run the exercise only when this file is executed as a script,
#  not when it is imported.
#
if __name__ == "__main__":
  exercise4()