# include <stdio.h>
# include <stdlib.h>

int main ( );
__global__ void laplace ( float *t1_gpu, float *t2_gpu, int m, int n );
__device__ float bc_gpu ( float x, float y );
__device__ float rhs_gpu ( float x, float y );

/******************************************************************************/

int main ( )

/******************************************************************************/
/*
  Purpose:

    laplace_solve() is a CUDA program for the iterative solution of the
    Laplace equation.

  Discussion:

    The region is the rectangle [0.0,2.0] x [0.0,1.0].

    A grid of 41 points in the X direction and 21 points in the Y direction
    is used.

    If the temperatures are stored in a 21x41 array T, then the shape of the
    array will correspond exactly to the shape of the physical grid if we
    set, for i = 0...m-1, j = 0...n-1:

      x(j) = 2 * j / ( n - 1 )
      y(i) = ( m - 1 - i ) / ( m - 1 )

    We want to solve the discretized heat equation:

      ( 4*t(i,j) - t(i-1,j) - t(i+1,j) - t(i,j-1) - t(i,j+1) ) / delta^2 = f(x,y)

    where delta is the uniform grid spacing, with boundary conditions

      t(0.0,  * ) =  10.0
      t(  *, 0.0) =   0.0
      t(  *, 1.0) =   0.0
      t(2.0,  * ) = 100.0

    The Jacobi iteration is repeated 100 times; each repetition carries out
    two sweeps (T1 updates T2, then T2 updates T1), so the latest estimate
    ends up in T1.

  Modified:

    12 March 2017
*/
{
  int i;
  int j;
  int it;
  int m;
  int memsize;
  int n;
  float *t_cpu;
  float *t1_gpu;
  float *t2_gpu;
  float x;
  float y;

  printf ( "\n" );
  printf ( "laplace_solve():\n" );
  printf ( " CUDA/C version.\n" );
  printf ( " Solve the Laplace equation on an MxN grid.\n" );
/*
  Set the grid dimensions.
*/
  m = 21;
  n = 41;
/*
  Allocate T on the CPU.
*/
  memsize = m * n * sizeof ( float );
  t_cpu = ( float * ) malloc ( memsize );
/*
  Initialize T.
*/
  for ( j = 0; j < n; j++ )
  {
    for ( i = 0; i < m; i++ )
    {
      t_cpu[i+j*m] = 0.0;
    }
  }
/*
  Allocate T1, T2 on the GPU.
*/
  cudaMalloc ( ( void** ) &t1_gpu, memsize );
  cudaMalloc ( ( void** ) &t2_gpu, memsize );
/*
  Copy t1_gpu <= t_cpu, from host to device.
*/
  cudaMemcpy ( t1_gpu, t_cpu, memsize, cudaMemcpyHostToDevice );
/*
  Launch the kernel on the device, using a single block of MxN threads,
  one thread per grid point.

  Use T1 to update T2, then T2 to update T1.
  On return, the latest solution estimate is in T1.
*/
  dim3 thread_grid ( m, n, 1 );

  for ( it = 0; it < 100; it++ )
  {
    laplace <<< 1, thread_grid >>> ( t1_gpu, t2_gpu, m, n );
    laplace <<< 1, thread_grid >>> ( t2_gpu, t1_gpu, m, n );
  }
/*
  Copy t_cpu <= t1_gpu, from device to host.
*/
  cudaMemcpy ( t_cpu, t1_gpu, memsize, cudaMemcpyDeviceToHost );
/*
  Print the solution.
*/
  for ( j = 0; j < n; j++ )
  {
    x = 0.0 + 2.0 * ( float ) ( j ) / ( float ) ( n - 1 );
    for ( i = 0; i < m; i++ )
    {
      y = 0.0 + 1.0 * ( float ) ( m - 1 - i ) / ( float ) ( m - 1 );
      printf ( " %8.4f %8.4f %14.6g\n", x, y, t_cpu[i+j*m] );
/*
      printf ( " %3d %3d %8.4f %8.4f %14.6g\n", i, j, x, y, t_cpu[i+j*m] );
*/
    }
    printf ( "\n" );
  }
/*
  Free CPU memory.
*/
  free ( t_cpu );
/*
  Free GPU memory.
*/
  cudaFree ( t1_gpu );
  cudaFree ( t2_gpu );
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "laplace_solve():\n" );
  printf ( " Normal end of execution.\n" );

  return 0;
}
/******************************************************************************/

__global__ void laplace ( float *t1_gpu, float *t2_gpu, int m, int n )

/******************************************************************************/
/*
  Purpose:

    laplace() carries out, for one thread, one step of the Jacobi iteration
    for Laplace's equation.
  Discussion:

    The region is the rectangle [0.0,2.0] x [0.0,1.0].

  Modified:

    12 March 2017
*/
{
  float delta;
  int i = threadIdx.x;
  int j = threadIdx.y;
  float tim1;
  float tip1;
  float tjm1;
  float tjp1;
  float x;
  float y;

  x = 0.0 + 2.0 * ( float ) ( j ) / ( float ) ( n - 1 );
  y = 0.0 + 1.0 * ( float ) ( m - 1 - i ) / ( float ) ( m - 1 );
/*
  Boundary points take the boundary value; interior points take the Jacobi
  average of their four neighbors plus the right hand side term.
*/
  if ( i == 0 || i == m - 1 || j == 0 || j == n - 1 )
  {
    t2_gpu[i+j*m] = bc_gpu ( x, y );
  }
  else
  {
    tim1 = t1_gpu[i-1+j*m];
    tip1 = t1_gpu[i+1+j*m];
    tjm1 = t1_gpu[i+(j-1)*m];
    tjp1 = t1_gpu[i+(j+1)*m];
    delta = 1.0 / ( float ) ( m - 1 );
    t2_gpu[i+j*m] = 0.25 *
      ( rhs_gpu ( x, y ) * delta * delta + tim1 + tip1 + tjm1 + tjp1 );
  }

  return;
}
/******************************************************************************/

__device__ float bc_gpu ( float x, float y )

/******************************************************************************/
/*
  Purpose:

    bc_gpu() evaluates the boundary condition at the point (x,y).
*/
{
  float eps = 0.000001;
  float value;

  if ( fabs ( y ) <= eps || fabs ( y - 1.0 ) <= eps )
  {
    value = 0.0;
  }
  else if ( fabs ( x ) <= eps )
  {
    value = 10.0;
  }
  else if ( fabs ( x - 2.0 ) <= eps )
  {
    value = 100.0;
  }
  else
  {
    value = 0.0;
  }

  return value;
}
/******************************************************************************/

__device__ float rhs_gpu ( float x, float y )

/******************************************************************************/
/*
  Purpose:

    rhs_gpu() evaluates the right hand side f(x,y), which is zero for the
    Laplace equation.
*/
{
  float value = 0.0;

  return value;
}
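/*
  Usage sketch (not part of the original source; the file name and output
  file are assumptions): with the NVIDIA CUDA toolkit installed, a program
  like this one is typically compiled and run as

    nvcc -o laplace_solve laplace_solve.cu
    ./laplace_solve > laplace_solve.txt

  which writes the grid coordinates and the temperature estimate at each
  point after the Jacobi sweeps.
*/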