#! /usr/bin/env python3
#
def iris_classify_knn ( ):

#*****************************************************************************80
#
## iris_classify_knn() uses k-nearest neighbor classification on iris data.
#
#  Discussion:
#
#    There are three species of iris.
#
#    Each species has characteristic values for four quantities:
#      petal length,
#      petal width,
#      sepal length,
#      sepal width.
#
#    We are given data containing 150 samples, specifying the species and
#    the four measurements.
#
#    Our task is to construct a model which can accept a set of measurements
#    corresponding to a new iris, and estimate the corresponding species
#    to which it belongs.
#
#    We will do this using the k-nearest-neighbors algorithm, which classifies
#    a new data item by looking at the k nearest sets of measurements, and
#    choosing the species that most of those neighbors share.  In this example,
#    k is simply 1.
#
#    The function takes no arguments and returns nothing.  It reports its
#    progress with print(), and saves a scatter matrix plot of the training
#    data to the file 'iris_classify_knn.png' in the current directory.
#
#    Any new measurement passed to the model must list its four values in
#    the same column order as iris_dataset['feature_names'], because that
#    is the order of the columns the model is trained on.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    15 June 2023
#
#  Author:
#
#    Andreas Mueller, Sarah Guido.
#    Modifications by John Burkardt.
#
#  Reference:
#
#    Andreas Mueller, Sarah Guido,
#    Introduction to Machine Learning with Python,
#    OReilly, 2017,
#    ISBN: 978-1-449-36941-5
#
  import matplotlib.pyplot as plt
  import mglearn
  import numpy as np
  import pandas as pd
  import platform
  import sklearn
  from sklearn.datasets import load_iris
  from sklearn.model_selection import train_test_split
  from sklearn.neighbors import KNeighborsClassifier

  print ( '' )
  print ( 'iris_classify_knn():' )
  print ( '  Python version: ' + platform.python_version ( ) )
  print ( '  scikit-learn version: '+ sklearn.__version__ )
  print ( '  Retrieve the iris data set.' )
  print ( '  Classify the data.' )
  print ( '  Use the k-nearest neighbor method.' )
  print ( '' )
#
#  Load the iris dataset.
#
  iris_dataset = load_iris ( )
#
#  Print the keys.
#
  print ( "  iris_dataset.keys():" )
  print ( iris_dataset.keys ( ) )
#
#  Print the dataset description.
#
  print ( '' )
  print ( "  iris_dataset['DESCR']:" )
  print ( iris_dataset['DESCR'] )
#
#  Print the names of the three species.
#
  print ( '' )
  print ( "  iris_dataset['target_names']:" )
  print ( iris_dataset['target_names'] )
#
#  Print the feature names.
#
  print ( '' )
  print ( "  iris_dataset['feature_names']:" )
  print ( iris_dataset['feature_names'] )
#
#  Print the type, shape, and sample of data.
#
  print ( '' )
  print ( "  type ( iris_dataset['data'] ):" )
  print ( type ( iris_dataset['data'] ) )
  print ( "  iris_dataset['data'].shape:" )
  print ( iris_dataset['data'].shape )
  print ( '' )
  print ( "  Initial rows of data:" )
  print ( iris_dataset['data'][:5] )
#
#  Print the type, shape, and all the target values.
#
  print ( '' )
  print ( "  type ( iris_dataset['target'] ):" )
  print ( type ( iris_dataset['target'] ) )
  print ( "  iris_dataset['target'].shape:" )
  print ( iris_dataset['target'].shape )
  print ( '' )
  print ( "  target values:" )
  print ( iris_dataset['target'] )
#
#  Randomly split the data into training and testing sets.
#  For reproducibility, specify the initial random state.
#
  print ( '' )
  print ( 'Randomly split the data into training and testing sets.' )

  X_train, X_test, y_train, y_test = train_test_split (
    iris_dataset['data'], iris_dataset['target'], random_state = 0 )
#
#  To verify, print shapes and samples of training and testing sets.
#
  print ( '  X_train.shape = ', X_train.shape )
  print ( '  y_train.shape = ', y_train.shape )

  print ( '  X_test.shape = ', X_test.shape )
  print ( '  y_test.shape = ', y_test.shape )
#
#  Create a dataframe so we can use pandas to plot.
#  Points are colored by their training label.
#
  iris_dataframe = pd.DataFrame ( X_train, columns = iris_dataset.feature_names )

  pd.plotting.scatter_matrix ( iris_dataframe, c = y_train, figsize = (15,15),
    marker = 'o', hist_kwds = {'bins' : 20 }, s = 60, alpha = 0.8,
    cmap = mglearn.cm3 )

  plt.savefig ( 'iris_classify_knn.png' )
#
#  Release the figure once it has been written to disk, so that repeated
#  calls do not accumulate open figures.
#
  plt.close ( )
#
#  Get the k-nearest-neighbors classifier.
#
  knn = KNeighborsClassifier ( n_neighbors = 1 )
#
#  Build the model using the training set.
#  fit() trains knn in place, so no further classifier object is needed.
#
  knn.fit ( X_train, y_train )
#
#  Use the model to predict the class of a new piece of data.
#  The model expects a 2D array with one row per sample, hence the
#  nested brackets around the single set of measurements.
#
  print ( "" )
  print ( "Use the model to predict the class of a new piece of data." )
  X_new = np.array ( [ [ 5.0, 2.9, 1.0, 0.2 ] ] )
  print ( "  X_new.shape:", X_new.shape )

  y_new = knn.predict ( X_new )
  print ( "  Prediction:", y_new )
  print ( "  Predicted target name:",
    iris_dataset [ 'target_names' ][y_new] )
#
#  Use the test data to verify the model.
#  The score is computed twice: directly, as the fraction of matching
#  predictions, and by the classifier's own score() method.
#
  print ( "" )
  print ( "Use the test data to verify the model." )

  y_pred = knn.predict ( X_test )
  print ( "  Test set predictions\n", y_pred )
  print ( "  Test set score:", np.mean ( y_pred == y_test ) )
  print ( "  Test set score:", knn.score ( X_test, y_test ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'iris_classify_knn():' )
  print ( '  Normal end of execution.' )

  return

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the current date and time on a single line.
#
#  Discussion:
#
#    The current time is read with time.time() and formatted with
#    time.ctime() before being printed.  The function takes no arguments
#    and returns nothing.
#
#  Licensing:
#
#    This code is distributed under the MIT license. 
#
#  Modified:
#
#    21 August 2019
#
#  Author:
#
#    John Burkardt
#
  import time

  seconds_now = time.time ( )
  stamp = time.ctime ( seconds_now )
  print ( stamp )

#
#  When run as a script, bracket the demonstration with timestamps so the
#  output shows when the run started and finished.
#
if __name__ == '__main__':
  timestamp ( )
  iris_classify_knn ( )
  timestamp ( )