#! /usr/bin/env python3
#
import matplotlib.pyplot as plt
import numpy as np

def cancer_classify_gradboost ( ):

#*****************************************************************************80
#
## cancer_classify_gradboost() uses gradient boosting to classify cancer data.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    31 July 2023
#
#  Author:
#
#    Andreas Mueller, Sarah Guido.
#    Modifications by John Burkardt.
#
#  Reference:
#
#    Andreas Mueller, Sarah Guido,
#    Introduction to Machine Learning with Python,
#    O'Reilly, 2017,
#    ISBN: 978-1-449-36941-5
#
  from sklearn.ensemble import GradientBoostingClassifier
  import platform
  import sklearn

  print ( '' )
  print ( 'cancer_classify_gradboost():' )
  print ( '  Python version: ' + platform.python_version ( ) )
  print ( '  scikit-learn version: ' + sklearn.__version__ )
#
#  Retrieve the dataset.
#
  print ( '' )
  print ( '  Retrieve the cancer dataset (X, y).' )

  from sklearn.datasets import load_breast_cancer
  cancer = load_breast_cancer ( )
#
#  Split the dataset into training and testing sets.
#
  from sklearn.model_selection import train_test_split
  X_train, X_test, y_train, y_test = train_test_split ( \
    cancer.data, cancer.target, stratify = cancer.target, random_state = 0 )
#
#  Get the classifier.
#
  gbrt = GradientBoostingClassifier ( random_state = 0 )
#
#  Fit the training data.
#
  gbrt.fit ( X_train, y_train )
#
#  Compute the training and testing accuracy.
#
  print ( '' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Because the training set accuracy is very high, we may be overfitting.
#  We can lower the learning rate or limit the maximum depth; a cross-validated
#  search over these two settings is sketched after this function.
#
#  Lower the learning rate.
#
  gbrt = GradientBoostingClassifier ( random_state = 0, learning_rate = 0.01 )
  gbrt.fit ( X_train, y_train )

  print ( '' )
  print ( 'Repeat calculation with learning rate decreased to 0.01:' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Limit the maximum depth.
#
  gbrt = GradientBoostingClassifier ( random_state = 0, max_depth = 1 )
  gbrt.fit ( X_train, y_train )

  print ( '' )
  print ( 'Repeat calculation with max_depth restricted to 1:' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Plot the feature importances for the max_depth = 1 case.
#
  plot_feature_importances_cancer ( cancer, gbrt )

  filename = 'cancer_classify_gradboost.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "' + filename + '"' )
#
#  Terminate.
#
  print ( '' )
  print ( 'cancer_classify_gradboost():' )
  print ( '  Normal end of execution.' )

  return
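#
#  The runs above pick learning_rate = 0.01 and max_depth = 1 by hand.  The
#  function below is a minimal sketch of how such settings could instead be
#  chosen by cross-validated grid search, using scikit-learn's GridSearchCV.
#  The function name, the grid values, and cv = 5 are illustrative choices,
#  not taken from the reference, and the function is not called by the main
#  program.
#
def cancer_classify_gradboost_search ( ):

#*****************************************************************************80
#
## cancer_classify_gradboost_search() sketches a grid search over boosting settings.
#
  from sklearn.datasets import load_breast_cancer
  from sklearn.ensemble import GradientBoostingClassifier
  from sklearn.model_selection import GridSearchCV
  from sklearn.model_selection import train_test_split

  cancer = load_breast_cancer ( )

  X_train, X_test, y_train, y_test = train_test_split ( \
    cancer.data, cancer.target, stratify = cancer.target, random_state = 0 )
#
#  Candidate values for the two settings that were varied by hand above.
#
  param_grid = {
    'learning_rate' : [ 0.01, 0.1, 1.0 ],
    'max_depth' : [ 1, 2, 3 ] }
#
#  5-fold cross-validated search over the grid, fit on the training data only.
#
  search = GridSearchCV ( GradientBoostingClassifier ( random_state = 0 ),
    param_grid, cv = 5 )
  search.fit ( X_train, y_train )

  print ( '' )
  print ( 'cancer_classify_gradboost_search():' )
  print ( '  Best parameters: ', search.best_params_ )
  print ( '  Cross-validation accuracy = ', search.best_score_ )
  print ( '  Testing accuracy = ', search.score ( X_test, y_test ) )

  return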
def plot_feature_importances_cancer ( dataset, model ):

#*****************************************************************************80
#
## plot_feature_importances_cancer() plots the feature importances.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 July 2023
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    dataset: the dataset.
#
#    model: the fitted model whose feature importances are to be plotted.
#
  n_features = dataset.data.shape[1]

  plt.barh ( np.arange ( n_features ), model.feature_importances_, align = 'center' )
  plt.yticks ( np.arange ( n_features ), dataset.feature_names )
  plt.xlabel ( 'Feature importance' )
  plt.ylabel ( 'Feature' )
  plt.ylim ( -1, n_features )

  return

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the date as a timestamp.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 August 2019
#
#  Author:
#
#    John Burkardt
#
  import time

  t = time.time ( )
  print ( time.ctime ( t ) )

  return

if ( __name__ == '__main__' ):
  timestamp ( )
  cancer_classify_gradboost ( )
  timestamp ( )
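#
#  plot_feature_importances_cancer() displays the impurity-based importances
#  stored in model.feature_importances_.  The function below is a minimal
#  sketch of how those numbers could be cross-checked with permutation
#  importance on held-out data, using scikit-learn's permutation_importance().
#  The function name and n_repeats = 10 are illustrative choices, not taken
#  from the reference, and the function is not called by the main program.
#
def print_permutation_importances ( dataset, model, X_test, y_test ):

#*****************************************************************************80
#
## print_permutation_importances() prints permutation importances on test data.
#
  from sklearn.inspection import permutation_importance

  result = permutation_importance ( model, X_test, y_test, n_repeats = 10,
    random_state = 0 )
#
#  Report each feature's mean drop in test accuracy when that feature is
#  shuffled, from largest to smallest.
#
  for i in np.argsort ( result.importances_mean )[::-1]:
    print ( '  %-25s %8.4f' % ( dataset.feature_names[i], result.importances_mean[i] ) )

  return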