#! /usr/bin/env python3
#
def cancer_visualize_pca ( ):

#*****************************************************************************80
#
## cancer_visualize_pca() uses PCA to visualize malignant/benign differences.
#
#  Discussion:
#
#    The breast cancer dataset is loaded from scikit-learn, rescaled with
#    StandardScaler, and projected onto its first two principal components.
#    Two PNG files are written to the current working directory:
#
#      cancer_visualize_pca_components.png
#        scatter plot of the samples in the plane of the two components,
#        marked by class;
#
#      cancer_visualize_pca_coefficients.png
#        heat map of the component coefficients for each feature.
#
#    Progress messages, the data shapes, and the component matrix are
#    printed to standard output.
#
#    All third-party imports are local to this function, so that the module
#    can be imported even when scikit-learn or mglearn is not installed.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    20 September 2023
#
#  Author:
#
#    Andreas Mueller, Sarah Guido.
#    This version by John Burkardt.
#
#  Reference:
#
#    Andreas Mueller, Sarah Guido,
#    Introduction to Machine Learning with Python,
#    OReilly, 2017,
#    ISBN: 978-1-449-36941-5
#
  from sklearn.datasets import load_breast_cancer
  from sklearn.decomposition import PCA
  from sklearn.preprocessing import StandardScaler
  import matplotlib.pyplot as plt
  import mglearn
  import platform
  import sklearn

  print ( '' )
  print ( 'cancer_visualize_pca():' )
  print ( ' Python version: ' + platform.python_version ( ) )
  print ( ' scikit-learn version: '+ sklearn.__version__ )
  print ( ' Work with data from the cancer dataset.' )
  print ( ' Use principal component analysis to visualize differences' )
  print ( ' between malignant and benign cases.' )
#
#  Load the dataset.
#
  print ( '' )
  print ( ' Retrieve the cancer dataset, (X, y).' )
  cancer = load_breast_cancer ( )
#
#  Define the standard scaler, calibrate it for the cancer dataset, apply it.
#
  scaler = StandardScaler ( )
  scaler.fit ( cancer.data )
  x_scaled = scaler.transform ( cancer.data )
#
#  Apply PCA, requesting first two components.
#
  pca = PCA ( n_components = 2 )
  pca.fit ( x_scaled )
#
#  Transfer data onto the first two components.
#
  x_pca = pca.transform ( x_scaled )

  print ( '' )
  print ( ' Original shape: ', x_scaled.shape )
  print ( ' Reduced shape: ', x_pca.shape )
#
#  Plot first versus second component.
#
  plt.figure ( figsize = ( 8, 8 ) )
  mglearn.discrete_scatter ( x_pca[:,0], x_pca[:,1], cancer.target )
  plt.legend ( cancer.target_names, loc = 'best' )
  plt.gca().set_aspect ( 'equal' )
  plt.xlabel ( 'First principal component' )
  plt.ylabel ( 'Second principal component' )
  plt.grid ( True )
  filename = 'cancer_visualize_pca_components.png'
  plt.savefig ( filename )
  print ( ' Graphics saved as "' + filename + '"' )
#
#  Release the figure once it is on disk; pyplot otherwise keeps every
#  figure it has created registered until it is explicitly closed.
#
  plt.close ( )
#
#  View the two PCA components.
#
  print ( '' )
  print ( ' PCA component shape:', pca.components_.shape )
  print ( '' )
  print ( ' PCA components:' )
  print ( pca.components_ )
#
#  Heat map of coefficients.
#  Note that matshow() opens a new figure of its own.
#
  plt.matshow ( pca.components_, cmap = 'viridis' )
  plt.yticks ( [ 0, 1 ], [ 'First component', 'Second component' ] )
  plt.colorbar ( )
  plt.xticks ( range ( len ( cancer.feature_names ) ), \
    cancer.feature_names, rotation = 60, ha = 'left' )
  plt.xlabel ( 'Feature' )
  plt.ylabel ( 'Principal components' )
  filename = 'cancer_visualize_pca_coefficients.png'
  plt.savefig ( filename )
  print ( ' Graphics saved as "' + filename + '"' )
  plt.close ( )
#
#  Terminate.
#
  print ( '' )
  print ( 'cancer_visualize_pca():' )
  print ( ' Normal end of execution.' )

  return

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the date as a timestamp.
#
#  Discussion:
#
#    The current time is formatted by time.ctime() and printed to standard
#    output on a single line.  Nothing is returned.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    21 August 2019
#
#  Author:
#
#    John Burkardt
#
  import time

  t = time.time ( )
  print ( time.ctime ( t ) )

  return

if ( __name__ == '__main__' ):
#
#  Bracket the run with timestamps so the elapsed time can be read off
#  the output.
#
  timestamp ( )
  cancer_visualize_pca ( )
  timestamp ( )