# include # include int main ( ); __global__ void jacobi ( float *b_gpu, float *x1_gpu, float *x2_gpu, int n ); /******************************************************************************/ int main ( ) /******************************************************************************/ /* Purpose: CUDA program for the Jacobi iterative solution of A*x=b. Modified: 29 May 2017 Author: John Burkardt */ { float *b_cpu; float *b_gpu; int blocks; clock_t clock1; clock_t clock2; float e; float e_norm; float *x_cpu; float *x_exact; float *x1_gpu; float *x2_gpu; int i; int it; int memsize; int n = 10000; int threads; double value; printf ( "\n" ); printf ( "JACOBI:\n" ); printf ( " CUDA/C version.\n" ); printf ( " Apply Jacobi's iterative method to solve A*x=b\n" ); printf ( " System size is N = %d\n", n ); /* Choose number of blocks and threads. Note that max threads is probably 1024. */ if ( n <= 32 ) { blocks = 1; threads = n; } else { threads = 32; blocks = ( n - 1 ) / threads + 1; } printf ( " Blocks = %d, threads = %d, B*T=%d\n", blocks, threads, blocks * threads ); /* Set the vector data size in bytes. */ memsize = n * sizeof ( float ); /* Set the exact solution X. */ x_exact = ( float * ) malloc ( memsize ); for ( i = 0; i < n; i++ ) { x_exact[i] = ( float ) rand ( ) / ( float ) RAND_MAX; } /* Compute B based on the exact X. */ b_cpu = ( float * ) malloc ( memsize ); b_cpu[0] = 2.0 * x_exact[0] - x_exact[1]; for ( i = 1; i < n - 1; i++ ) { b_cpu[i] = - x_exact[i-1] + 2.0 * x_exact[i] - x_exact[i+1]; } b_cpu[n-1] = - x_exact[n-2] + 2.0 * x_exact[n-1]; /* Initialize the approximate solution. */ x_cpu = ( float * ) malloc ( memsize ); for ( i = 0; i < n; i++ ) { x_cpu[i] = 0.0; } /* Allocate B, X1, X2 on the GPU. */ cudaMalloc ( ( void** ) &b_gpu, memsize ); cudaMalloc ( ( void** ) &x1_gpu, memsize ); cudaMalloc ( ( void** ) &x2_gpu, memsize ); /* Copy b_gpu <= b_cpu, from host to device. Copy x1_gpu <= x_cpu, from host to device. Copy x2_gpu <= x_cpu, from host to device. */ cudaMemcpy ( b_gpu, b_cpu, memsize, cudaMemcpyHostToDevice ); cudaMemcpy ( x1_gpu, x_cpu, memsize, cudaMemcpyHostToDevice ); /* Launch the kernel on the device. Use X1 to update X2, then X2 to update X1. On return, latest solution estimate is in X1. */ clock1 = clock ( ); for ( it = 0; it < 10 * n; it++ ) { jacobi <<< threads, blocks >>> ( b_gpu, x1_gpu, x2_gpu, n ); jacobi <<< threads, blocks >>> ( b_gpu, x2_gpu, x1_gpu, n ); } /* Copy x_cpu <= x1_gpu from device to host. */ cudaMemcpy ( x_cpu, x1_gpu, memsize, cudaMemcpyDeviceToHost ); clock2 = clock ( ); value = ( double ) ( clock2 - clock1 ) / ( double ) CLOCKS_PER_SEC; /* Compute the residual norm. */ e_norm = 0.0; for ( i = 0; i < n; i++ ) { e = 2.0 * x_cpu[i] - b_cpu[i]; if ( 0 < i ) { e = e - x_cpu[i-1]; } if ( i < n - 1 ) { e = e - x_cpu[i+1]; } e_norm = e_norm + e * e; } e_norm = sqrt ( e_norm / ( float ) ( n ) ); printf ( "\n" ); printf ( " RMS error ||A*x-b|| = %g\n", e_norm ); /* Compute the approximation error norm. */ e_norm = 0.0; for ( i = 0; i < n; i++ ) { e_norm = e_norm + pow ( x_exact[i] - x_cpu[i], 2 ); } e_norm = sqrt ( e_norm / ( float ) ( n ) ); printf ( " RMS error ||x(exact) - x(computed)|| = %g\n", e_norm ); printf ( " Computation required %g seconds\n", value ); /* Print no more than 10 entries of the data. */ printf ( "\n" ); printf ( " I B[i] X[i] Xexact[i] (Xexact-X)[i] (A*X-B)[I]\n" ); printf ( "\n" ); for ( i = 0; i < n && i < 10; i++ ) { e = 2.0 * x_cpu[i] - b_cpu[i]; if ( 0 < i ) { e = e - x_cpu[i-1]; } if ( i < n - 1 ) { e = e - x_cpu[i+1]; } printf ( " %4d %8.4f %8.4f %8.4f %8.2e %8.2e\n", i, b_cpu[i], x_cpu[i], x_exact[i], x_exact[i] - x_cpu[i], e ); } /* Free CPU memory. */ free ( b_cpu ); free ( x_cpu ); free ( x_exact ); /* Free GPU memory. */ cudaFree ( b_gpu ); cudaFree ( x1_gpu ); cudaFree ( x2_gpu ); /* Reset GPU. */ cudaDeviceReset ( ); /* Terminate. */ printf ( "\n" ); printf ( "JACOBI:\n" ); printf ( " Normal end of execution.\n" ); return 0; } /******************************************************************************/ __global__ void jacobi ( float *b_gpu, float *x1_gpu, float *x2_gpu, int n ) /******************************************************************************/ { /* Purpose: JACOBI carries out one thread of one step of Jacobi iteration. Discussion: Specifically, for 0 < i < N - 1 - x(i-1) + 2 x(i) - x(i+1) = b(i) is rewritten x2(i) = ( b(i) + x(i-1) + x(i+1) ) / 2 for i = 0, we have x2(i) = ( b(i) + x(i+1) ) / 2 and for i = n - 1 we have x2(i) = ( b(i) + x(i-1) ) / 2 Modified: 29 May 2017 Author: John Burkardt */ int k = threadIdx.x + blockDim.x * blockIdx.x; float xim1; float xip1; if ( k < n ) { if ( 0 < k ) { xim1 = x1_gpu[k-1]; } else { xim1 = 0.0; } if ( k < n - 1 ) { xip1 = x1_gpu[k+1]; } else { xip1 = 0.0; } x2_gpu[k] = 0.5 * ( b_gpu[k] + xim1 + xip1 ); } return; }