#include <stdio.h>

int main ( );
__global__ void sizes ( int n );

/*
  Purpose:

    MAIN launches the SIZES kernel on a small 3D grid of 3D blocks so that
    the CUDA block and thread indexing variables can be inspected.

  Returns:

    0 on normal completion, 1 if the kernel launch or its execution
    reported a CUDA error.
*/
int main ( )
{
  int n;
  cudaError_t status;

  printf ( "\n" );
  printf ( "SIZES:\n" );
  printf ( " CUDA/C version.\n" );
  printf ( " Make sense of CUDA block and thread data.\n" );
//
//  Choose the launch configuration: a 2x3x1 grid of 2x1x4 blocks.
//
  dim3 blocks ( 2, 3, 1 );
  dim3 threads ( 2, 1, 4 );
  n = 40;

//
//  dim3 members are unsigned int, so print them with %u.
//
  printf ( "\n" );
  printf ( " BLOCKS: ( %u, %u, %u )\n", blocks.x, blocks.y, blocks.z );
  printf ( " THREADS: ( %u, %u, %u )\n", threads.x, threads.y, threads.z );
  printf ( " Tasks N: %d\n", n );
  printf ( "\n" );
//
//  Column header.  The kernel prints 14 values per thread, beginning with
//  the linear index k and the total thread count chunk.
//
  printf ( "k, chunk, gridDim.x, gridDim.y, gridDim.z, blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z, threadIdx.x, threadIdx.y, threadIdx.z\n" );
  printf ( "\n" );
//
//  Launch the kernel on the device.
//
  sizes <<< blocks, threads >>> ( n );
//
//  A kernel launch returns no status of its own; an invalid launch
//  configuration is reported through cudaGetLastError.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "SIZES - Fatal error!\n" );
    fprintf ( stderr, " Kernel launch failed: %s\n", cudaGetErrorString ( status ) );
    return 1;
  }
//
//  This call is required to flush the output printed by the GPU.
//  It also reports any error raised while the kernel was running.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "SIZES - Fatal error!\n" );
    fprintf ( stderr, " Kernel execution failed: %s\n", cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "SIZES:\n" );
  printf ( " Normal end of execution.\n" );

  return 0;
}

/*
  Purpose:

    SIZES reports on the values of various indices and dimensions.

  Discussion:

    Every thread of the launch prints one line containing:

      k      - its linear index over the whole launch, with threadIdx.x
               varying fastest, then threadIdx.y, threadIdx.z, blockIdx.x,
               blockIdx.y and finally blockIdx.z;
      chunk  - the total number of threads in the launch;

    followed by gridDim, blockIdx, blockDim and threadIdx (x, y, z each).

    The order in which the lines from different threads appear is not
    controlled by this code.

  Parameters:

    Input, int N, the number of tasks.  It is currently unused; it is
    reserved for the strided loop sketched at the end of the kernel.
*/
__global__ void sizes ( int n )
{
  int chunk;
  int k;
//
//  Copy the unsigned built-in variables into ints so that they match
//  the %d conversions used below.
//
  int u = threadIdx.x;
  int v = threadIdx.y;
  int w = threadIdx.z;
  int x = blockIdx.x;
  int y = blockIdx.y;
  int z = blockIdx.z;
  int U = blockDim.x;
  int V = blockDim.y;
  int W = blockDim.z;
  int X = gridDim.x;
  int Y = gridDim.y;
  int Z = gridDim.z;
//
//  Linear index of this thread among all threads of the launch.
//
  k = u
    + U * v
    + U * V * w
    + U * V * W * x
    + U * V * W * X * y
    + U * V * W * X * Y * z;
//
//  Total number of threads in the launch.
//
  chunk = U * V * W * X * Y * Z;

  printf ( "%d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
    k, chunk,
    X, Y, Z,
    x, y, z,
    U, V, W,
    u, v, w );
//
//  Unfinished sketch kept from the original source: each thread would
//  start at its own index and step through the N tasks.
//  NOTE(review): the start value and the step were left blank in the
//  original; presumably they are k and chunk - confirm before enabling.
//
/*
  k = ;
  t = k;
  while ( t < n )
  {
    print
    t = t + ?;
  }
*/
  return;
}