# include # include # include # include "cuda_loop.h" /******************************************************************************/ void cuda_loop ( int blocks[3], int threads[3], int n ) /******************************************************************************/ /* Purpose: CUDA_LOOP simulates the behavior of a CUDA loop. Discussion: A CUDA kernel "kernel()" is invoked by a command of the form kernel << blocks, threads >> ( args ) where blocks and threads are each vectors of up to 3 values, listing the number of blocks and number of threads to be used. If a problem involves N tasks, then tasks are allotted to specific CUDA processes in an organized fashion. Some processes may get no tasks, one task, or multiple tasks. Each process is given variables that can be used to determine the tasks to be performed: gridDim.x, gridDim.y, gridDim.z: the block dimensions as given by the user in "blocks"; blockDim.x, blockDim.y, blockDim.z: the thread dimensions as given by the user in "threads"; blockIdx.x, blockIdx.y, blockId.z: the block indices for this process. threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this process. Essentially, a process can determine its linear index K by: K = threadIdx.x + blockdim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z + blockDim.x * blockDim.y * blockDim.z * blockIdx.x + blockDim.x * blockDim.y * blockDim.z * gridDim.x * blockIdx.y + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * blockIdx.z Set task T = K. while ( T < N ) carry out task T; T = T + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z. Licensing: This code is distributed under the MIT license. Modified: 22 March 2017 Author: John Burkardt Parameters: Input, int BLOCKS[3], the CUDA block values. These should be nonnegative. Typically, the third entry is 1. Generally, the first two values cannot be greater than 35,535. Input, int THREADS[3], the CUDA thread values. These should be nonnegative. Typically, there is a maximum value imposed on these quantities, which depends on the GPU model. Input, int N, the number of tasks to be carried out. */ { int blockDimx; int blockDimy; int blockDimz; int blockIdx; int blockIdy; int blockIdz; int chunk; int gridDimx; int gridDimy; int gridDimz; int k1; int k2; int t; int threadIdx; int threadIdy; int threadIdz; printf ( "\n" ); printf ( "CUDA_LOOP:\n" ); printf ( " Simulate the assignment of N tasks to the blocks\n" ); printf ( " and threads of a GPU using CUDA.\n" ); printf ( "\n" ); printf ( " Number of tasks is %d\n", n ); printf ( " BLOCKS: { %d, %d, %d }\n", blocks[0], blocks[1], blocks[2] ); printf ( " THREADS: { %d, %d, %d }\n", threads[0], threads[1], threads[2] ); k1 = 0; blockDimx = threads[0]; blockDimy = threads[1]; blockDimz = threads[2]; gridDimx = blocks[0]; gridDimy = blocks[1]; gridDimz = blocks[2]; chunk = blocks[1] * blocks[0] * threads[2] * threads[1] * threads[0]; printf ( " Total threads = %d\n", chunk ); printf ( "\n" ); printf ( " Process Process (bx,by,bz) (tx,ty,tz) Tasks...\n" ); printf ( " Increment Formula\n" ); printf ( "\n" ); for ( blockIdz = 0; blockIdz < gridDimz; blockIdz++ ) { for ( blockIdy = 0; blockIdy < gridDimy; blockIdy++ ) { for ( blockIdx = 0; blockIdx < gridDimx; blockIdx++ ) { for ( threadIdz = 0; threadIdz < blockDimz; threadIdz++ ) { for ( threadIdy = 0; threadIdy < blockDimy; threadIdy++ ) { for ( threadIdx = 0; threadIdx < blockDimx; threadIdx++ ) { t = k1; k2 = threadIdx + blockDimx * threadIdy + blockDimx * blockDimy * threadIdz + blockDimx * blockDimy * blockDimz * blockIdx + blockDimx * blockDimy * blockDimz * gridDimx * blockIdy + blockDimx * blockDimy * blockDimz * gridDimx * gridDimy * blockIdz; printf ( " %7d %7d: (%2d,%2d,%2d) (%2d,%2d,%2d)", k1, k2, blockIdx, blockIdy, blockIdz, threadIdx, threadIdy, threadIdz ); while ( t < n ) { printf ( "%3d", t ); t = t + chunk; } printf ( "\n" ); k1 = k1 + 1; } } } } } } return; } /******************************************************************************/ void timestamp ( ) /******************************************************************************/ /* Purpose: TIMESTAMP prints the current YMDHMS date as a time stamp. Example: 17 June 2014 09:45:54 AM Licensing: This code is distributed under the MIT license. Modified: 17 June 2014 Author: John Burkardt Parameters: None */ { # define TIME_SIZE 40 static char time_buffer[TIME_SIZE]; const struct tm *tm; time_t now; now = time ( NULL ); tm = localtime ( &now ); strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); printf ( "%s\n", time_buffer ); return; # undef TIME_SIZE }