#! /usr/bin/env python3
#
import os
from os.path import exists

import numpy as np


def _column_statistics(data):
    """Return a 7 x cols table of per-column statistics.

    Rows, in order: minimum, maximum, range, mean, variance (population),
    standard deviation, and root-mean-square.

    data: 2-D array with one observation per row.
    """
    rows = data.shape[0]
    low = np.min(data, axis=0)
    high = np.max(data, axis=0)
    variance = np.var(data, axis=0)
    # RMS averages the squares over the observations in a column, so the
    # divisor is the number of rows (not the number of columns).
    rms = np.linalg.norm(data, axis=0) / np.sqrt(rows)
    return np.vstack(
        [
            low,
            high,
            high - low,
            np.mean(data, axis=0),
            variance,
            np.sqrt(variance),
            rms,
        ]
    )


def _standardize(data, mean, std):
    """Return (data - mean) / std, column by column.

    A column with zero standard deviation is constant, so it maps to all
    zeros instead of producing NaN from a 0/0 division.
    """
    safe_std = np.where(std > 0.0, std, 1.0)
    return (data - mean) / safe_std


def exercise3(delete_datafile=False):

#*****************************************************************************80
#
## exercise3() processes the Ruspini dataset.
#
#  Discussion:
#
#    The data is stored as a text file, with spaces used as the delimiter.
#
#    The data file is downloaded if no local copy exists.  Column
#    statistics are printed, the data is standardized and plotted,
#    kmeans is applied for an increasing number of clusters to report
#    the inertia, and finally the data is clustered with K = 4.
#
#  Licensing:
#
#    This code is distributed under the GNU LGPL license.
#
#  Modified:
#
#    19 January 2022
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    bool delete_datafile: if True, the data file is removed at the end.
#    The default, False, keeps the file.
#
    # Imported here, as in the original, so that merely importing this
    # module does not require scikit-learn or matplotlib.
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt

    print("exercise3():")
    print(" Process the Ruspini data.")
#
#  Get a copy of the datafile from the website.
#
    datafile = 'ruspini_data.txt'

    if exists(datafile):
        print(" Already have a copy of " + datafile)
    else:
        # wget is only needed when the file must actually be fetched.
        import wget
        print(" Downloading dataset file:")
        url = 'https://people.sc.fsu.edu/~jburkardt/classes/ml_2022/datasets/' + datafile
        datafile = wget.download(url)
#
#  Read the data from the file.
#  ndmin=2 keeps a one-row file two-dimensional so the shape unpacks.
#
    data = np.loadtxt(datafile, ndmin=2)
    rows, cols = np.shape(data)

    print('')
    print(' "' + datafile + '" contains', rows, 'rows and', cols, 'columns.')
#
#  Print the first five lines (or fewer, if the file is shorter).
#
    print('')
    print(' First five lines of data:')
    print('')
    for i in range(min(5, rows)):
        print(' (%d) ' % (i), end='')
        for j in range(cols):
            print(' %8.2g' % (data[i, j]), end='')
        print('')
#
#  Compute statistical data.
#
    s = _column_statistics(data)

    labels = ('Minimum ', 'Maximum ', 'Range ', 'Mean ', 'Variance',
              'STD ', 'RMS ')

    print('')
    print(' Statistics for data:')
    print('')
    for label, stat_row in zip(labels, s):
        print(' %8s ' % (label), end='')
        for value in stat_row:
            print(' %8.2g' % (value), end='')
        print('')
#
#  Standardize the data using the mean (row 3) and STD (row 5).
#
    data2 = _standardize(data, s[3], s[5])
#
#  Create a scatter plot, with circles of radius 1, 2 and 3 for reference.
#
    plt.scatter(data2[:, 0], data2[:, 1])
    tc = np.linspace(0, 2.0 * np.pi, 51)
    xc = np.cos(tc)
    yc = np.sin(tc)
    for radius in (1.0, 2.0, 3.0):
        plt.plot(radius * xc, radius * yc, 'r-', linewidth=2)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Ruspini dataset')
    plt.grid(True)
    plt.axis('equal')
    plotfile = 'exercise3.jpg'
    plt.savefig(plotfile)
    print(' Graphics saved as "' + plotfile + '"')
    plt.show()
    plt.close()
#
#  Apply kmeans to the data, and report the inertia.
#  KMeans needs at least as many samples as clusters, so cap kmax.
#  n_init is given explicitly because its default differs between
#  scikit-learn releases.
#
    print('')
    print(' k Inertia')
    print('')

    kmax = min(10, rows)
    inertia = np.zeros(kmax)
    for k in range(1, kmax + 1):
        kmeans = KMeans(n_clusters=k, n_init=10)
        kmeans.fit(data2)
        inertia[k - 1] = kmeans.inertia_
        print(' %d %g' % (k, inertia[k - 1]))
#
#  Plot the inertia.
#
    plt.plot(np.arange(1, kmax + 1), inertia, 'bo-', linewidth=3)
    plt.grid(True)
    plt.xlabel('K: Number of clusters')
    plt.ylabel('Inertia(K)')
    plt.title('Ruspini inertia with increasing number of clusters')
    plotfile = 'exercise3_inertia.jpg'
    plt.savefig(plotfile)
    print(' Graphics saved as "' + plotfile + '"')
    plt.show()
    plt.close()
#
#  Use the chosen value of K to cluster the data.
#
    k = min(4, rows)
    kmeans = KMeans(n_clusters=k, n_init=10)
    y = kmeans.fit_predict(data2)
#
#  Plot the clusters using different colors, and mark the centers.
#
    plt.clf()
    for i in range(k):
        members = (y == i)
        plt.scatter(data2[members, 0], data2[members, 1])
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=250, marker='*')
    plt.title('Ruspini data')
    plt.grid(True)
    plt.axis('equal')
    plotfile = 'exercise3_clusters.jpg'
    plt.savefig(plotfile)
    print(' Graphics saved as "' + plotfile + '"')
    plt.show()
    plt.close()
#
#  Discard the data file, if requested.
#
    if delete_datafile:
        print("")
        print(" Deleting dataset file:")
        os.remove(datafile)
#
#  Terminate.
#
    print("")
    print("exercise3():")
    print(" Normal end of execution.")


if __name__ == "__main__":
    exercise3()