#! /usr/bin/env python3
#
import matplotlib.pyplot as plt
import numpy as np

def cancer_classify_gradboost ( ):

#*****************************************************************************80
#
## cancer_classify_gradboost() uses gradient boosting to classify cancer data.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    31 July 2023
#
#  Author:
#
#    Andreas Mueller, Sarah Guido.
#    Modifications by John Burkardt.
#
#  Reference:
#
#    Andreas Mueller, Sarah Guido,
#    Introduction to Machine Learning with Python,
#    O'Reilly, 2017,
#    ISBN: 978-1-449-36941-5
#
  from sklearn.ensemble import GradientBoostingClassifier
  import platform
  import sklearn

  print ( '' )
  print ( 'cancer_classify_gradboost():' )
  print ( '  Python version: ' + platform.python_version ( ) )
  print ( '  scikit-learn version: ' + sklearn.__version__ )
#
#  Retrieve the dataset.
#
  print ( '' )
  print ( '  Retrieve the cancer dataset (X, y).' )

  from sklearn.datasets import load_breast_cancer
  cancer = load_breast_cancer ( )
#
#  Split the dataset into training and testing sets.
#
  from sklearn.model_selection import train_test_split
  X_train, X_test, y_train, y_test = train_test_split ( \
    cancer.data, cancer.target, stratify = cancer.target, random_state = 0 )
#
#  Get the classifier.
#
  gbrt = GradientBoostingClassifier ( random_state = 0 )
#
#  Fit the training data.
#
  gbrt.fit ( X_train, y_train )
#
#  Compute the training and testing accuracy.
#
  print ( '' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Because the training set accuracy is very high, we may be overfitting.
#  We can lower the learning rate or limit the maximum depth; a cross-validated
#  search over these two settings is sketched after this function.
#
#  Lower the learning rate.
#
  gbrt = GradientBoostingClassifier ( random_state = 0, learning_rate = 0.01 )
  gbrt.fit ( X_train, y_train )

  print ( '' )
  print ( 'Repeat calculation with learning rate decreased to 0.01:' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Limit the maximum depth.
#
  gbrt = GradientBoostingClassifier ( random_state = 0, max_depth = 1 )
  gbrt.fit ( X_train, y_train )

  print ( '' )
  print ( 'Repeat calculation with max_depth restricted to 1:' )
  print ( '  Training accuracy = ', gbrt.score ( X_train, y_train ) )
  print ( '  Testing accuracy = ', gbrt.score ( X_test, y_test ) )
#
#  Plot the feature importances for the max_depth = 1 case.
#
  plot_feature_importances_cancer ( cancer, gbrt )

  filename = 'cancer_classify_gradboost.png'
  plt.savefig ( filename )
  print ( '  Graphics saved as "' + filename + '"' )
#
#  Terminate.
#
  print ( '' )
  print ( 'cancer_classify_gradboost():' )
  print ( '  Normal end of execution.' )

  return
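#
#  The runs above pick learning_rate = 0.01 and max_depth = 1 by hand.  The
#  function below is a minimal sketch of how such settings could instead be
#  chosen by cross-validated grid search, using scikit-learn's GridSearchCV.
#  The function name, the grid values, and cv = 5 are illustrative choices,
#  not taken from the reference, and the function is not called by the main
#  program.
#
def cancer_classify_gradboost_search ( ):

#*****************************************************************************80
#
## cancer_classify_gradboost_search() sketches a grid search over boosting settings.
#
  from sklearn.datasets import load_breast_cancer
  from sklearn.ensemble import GradientBoostingClassifier
  from sklearn.model_selection import GridSearchCV
  from sklearn.model_selection import train_test_split

  cancer = load_breast_cancer ( )

  X_train, X_test, y_train, y_test = train_test_split ( \
    cancer.data, cancer.target, stratify = cancer.target, random_state = 0 )
#
#  Candidate values for the two settings that were varied by hand above.
#
  param_grid = {
    'learning_rate' : [ 0.01, 0.1, 1.0 ],
    'max_depth' : [ 1, 2, 3 ] }
#
#  5-fold cross-validated search over the grid, fit on the training data only.
#
  search = GridSearchCV ( GradientBoostingClassifier ( random_state = 0 ),
    param_grid, cv = 5 )
  search.fit ( X_train, y_train )

  print ( '' )
  print ( 'cancer_classify_gradboost_search():' )
  print ( '  Best parameters: ', search.best_params_ )
  print ( '  Cross-validation accuracy = ', search.best_score_ )
  print ( '  Testing accuracy = ', search.score ( X_test, y_test ) )

  return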
def plot_feature_importances_cancer ( dataset, model ):

#*****************************************************************************80
#
## plot_feature_importances_cancer() plots the feature importances.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 July 2023
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    dataset: the dataset.
#
#    model: the fitted model whose feature importances are to be plotted.
#
  n_features = dataset.data.shape[1]

  plt.barh ( np.arange ( n_features ), model.feature_importances_, align = 'center' )
  plt.yticks ( np.arange ( n_features ), dataset.feature_names )
  plt.xlabel ( 'Feature importance' )
  plt.ylabel ( 'Feature' )
  plt.ylim ( -1, n_features )

  return

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the date as a timestamp.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 August 2019
#
#  Author:
#
#    John Burkardt
#
  import time

  t = time.time ( )
  print ( time.ctime ( t ) )

  return

if ( __name__ == '__main__' ):
  timestamp ( )
  cancer_classify_gradboost ( )
  timestamp ( )
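#
#  plot_feature_importances_cancer() displays the impurity-based importances
#  stored in model.feature_importances_.  The function below is a minimal
#  sketch of how those numbers could be cross-checked with permutation
#  importance on held-out data, using scikit-learn's permutation_importance().
#  The function name and n_repeats = 10 are illustrative choices, not taken
#  from the reference, and the function is not called by the main program.
#
def print_permutation_importances ( dataset, model, X_test, y_test ):

#*****************************************************************************80
#
## print_permutation_importances() prints permutation importances on test data.
#
  from sklearn.inspection import permutation_importance

  result = permutation_importance ( model, X_test, y_test, n_repeats = 10,
    random_state = 0 )
#
#  Report each feature's mean drop in test accuracy when that feature is
#  shuffled, from largest to smallest.
#
  for i in np.argsort ( result.importances_mean )[::-1]:
    print ( '  %-25s %8.4f' % ( dataset.feature_names[i], result.importances_mean[i] ) )

  return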