#! /usr/bin/env python3
#
_DATASET_URL = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/'


def _load_dataset(datafile):
    """Load a whitespace-delimited dataset, downloading it first if missing.

    Returns a pair (array, filename): the array produced by np.loadtxt() and
    the name of the local file that was actually read.
    """
    from os.path import exists

    import numpy as np

    if exists(datafile):
        print(" Already have a copy of " + datafile)
    else:
        # wget is only needed when the file is not already present, so it is
        # imported here rather than unconditionally.
        import wget

        print(" Downloading dataset file:")
        url = _DATASET_URL + datafile
        # wget.download() returns the name of the file it wrote; use that
        # name from here on, as the original script did.
        datafile = wget.download(url)

    return np.loadtxt(datafile), datafile


def exercise4():
    """Use kmeans to cluster data, and then assign new data to clusters.

    Discussion:

      The data is stored as a text file, with spaces used as the delimiter.

      Three files are read (and downloaded if not present in the current
      directory): blobs_data.txt, blobs_clusters.txt and blobs_centers.txt.
      The first 2000 data rows are used for clustering; the following 20
      rows are held back and assigned to clusters afterwards.

      Two plots are written: exercise4_data.jpg and exercise4_inertia.jpg.

    Licensing:

      This code is distributed under the GNU LGPL license.

    Modified:

      20 January 2022

    Author:

      John Burkardt
    """
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    import numpy as np

    print("exercise4():")
    print(" Retrieve blobs data, clusters, centers.")
    print(" Work with 2000 data values, save 20 for later.")
    print(" Use inertia to choose k.")
    print(" Cluster the data.")
    print(" Now predict cluster assignments for last 20.")
    print(" Compare predictions to correct values.")

    # Get blobs_data.txt
    X, datafile = _load_dataset('blobs_data.txt')
    rows, cols = np.shape(X)
    print('')
    print(' "' + datafile + '" contains', rows, 'rows and', cols, 'columns.')

    # Get blobs_clusters.txt
    y, datafile = _load_dataset('blobs_clusters.txt')
    y = y.astype(int)
    # Report an element count; np.shape(y) would print a tuple here.
    rows = y.size
    print('')
    print(' "' + datafile + '" contains', rows, 'rows.')

    # Get blobs_centers.txt
    C, datafile = _load_dataset('blobs_centers.txt')
    rows, cols = np.shape(C)
    print('')
    print(' "' + datafile + '" contains', rows, 'rows and', cols, 'columns.')

    # Split data into training and test sets.
    n1 = 2000
    X1 = X[0:n1, :]
    y1 = y[0:n1]

    n2 = 20
    X2 = X[n1:n1 + n2, :]
    y2 = y[n1:n1 + n2]

    plt.scatter(X1[:, 0], X1[:, 1], c=y1, s=1)
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plotfile = 'exercise4_data.jpg'
    plt.savefig(plotfile)
    print(' Graphics saved as "' + plotfile + '"')
    plt.show()
    plt.close()

    # Apply kmeans to the data, and report the inertia.
    print('')
    print(' k Inertia')
    print('')

    kmax = 10
    inertia = np.zeros(kmax)
    for k in range(1, kmax + 1):
        # n_init is pinned because the library default changed between
        # scikit-learn releases; 10 is the value this script was written for.
        kmeans = KMeans(n_clusters=k, n_init=10)
        kmeans.fit(X1)
        inertia[k - 1] = kmeans.inertia_
        print(' %d %g' % (k, inertia[k - 1]))

    # Plot the inertia.
    plt.plot(np.arange(1, kmax + 1), inertia, 'bo-', linewidth=3)
    plt.grid(True)
    plt.xlabel('K: Number of clusters')
    plt.ylabel('Inertia(K)')
    plt.title('Blob inertia with increasing number of clusters')
    plotfile = 'exercise4_inertia.jpg'
    plt.savefig(plotfile)
    print(' Graphics saved as "' + plotfile + '"')
    plt.show()
    plt.close()

    # Choose to use 5 clusters.
    k = 5
    kmeans = KMeans(n_clusters=k, n_init=10)
    kmeans.fit(X1)

    # Predict clusters for test data.
    y2_pred = kmeans.predict(X2)

    # For each test data value, compare blob center to cluster center.
    # The blob labels and the kmeans labels are numbered independently, so
    # the centers are printed side by side rather than the labels.
    print("")
    print(" Test data: blob center and cluster center:")
    print("")
    # Iterate over the labels actually present, so a data file with fewer
    # than n1 + n2 rows does not raise an IndexError.
    for i, (blob, cluster) in enumerate(zip(y2, y2_pred)):
        print(i, C[blob, :], kmeans.cluster_centers_[cluster, :])

    # Terminate.
    print("")
    print("exercise4():")
    print(" Normal end of execution.")


if __name__ == "__main__":
    exercise4()