# include <stdio.h>
# include <stdlib.h>
# include <time.h>

int main ( int argc, char *argv[] );
void cuda_tasks ( int blocks[3], int threads[3], int n );
void timestamp ( );

/******************************************************************************/

int main ( int argc, char *argv[] )

/******************************************************************************/
/*
  Purpose:

    MAIN is the main program for CUDA_TASKS.

  Discussion:

    A CUDA kernel "kernel()" is invoked by a command of the form
      
      kernel << blocks, threads >> ( args )

    where blocks and threads are each vectors of up to 3 values,
    listing the number of blocks and number of threads to be used.

    If a problem involves N tasks, then tasks are allotted to 
    specific CUDA processes in an organized fashion.  Some processes
    may get no tasks, one task, or multiple tasks.  

    Each process is given variables that can be used to determine
    the tasks to be performed:

      gridDim.x, gridDim.y, gridDim.z: the block dimensions as
      given by the user in "blocks";

      blockDim.x, blockDim.y, blockDim.z: the thread dimensions as
      given by the user in "threads";

      blockIdx.x, blockIdx.y, blockId.z: the block indices for this process.

      threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this process.

    Essentially, a process can determine its linear index K by:

      K = threadIdx.x
        +  blockdim.x  * threadIdx.y
        +  blockDim.x  *  blockDim.y  * threadIdx.z
        +  blockDim.x  *  blockDim.y  *  blockDim.z  * blockIdx.x
        +  blockDim.x  *  blockDim.y  *  blockDim.z  *  gridDim.x  * blockIdx.y
        +  blockDim.x  *  blockDim.y  *  blockDim.z  *  gridDim.x  *  gridDim.y  * blockIdx.z

    Set task T = K.

    while ( T < N )
      carry out task T;
      T = T + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z.

    This program suggests how a specific set of block and thread parameters 
    would determine the assignment of individual tasks to CUDA processes.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    28 March 2017

  Author:

    John Burkardt

  Parameters:

    Local, int BLOCKS[3], the CUDA block values.  These should be nonnegative.
    Typically, the third entry is 1.  Generally, the first two values cannot
    be greater than 35,535.

    Local, int THREADS[3], the CUDA thread values.  These should be nonnegative.
    Typically, there is a maximum value imposed on these quantities, which
    depends on the GPU model.

    Local, int N, the number of tasks to be carried out.
*/
{
  int blocks[3];
  int n;
  int threads[3];
/*
  Command line: cuda_loops b0 b1 b2 t0 t1 t2 n

  We trust the user to input 7 arguments correctly!
*/
  blocks[0] = atoi ( argv[1] );
  blocks[1] = atoi ( argv[2] );
  blocks[2] = atoi ( argv[3] );
  threads[0] = atoi ( argv[4] );
  threads[1] = atoi ( argv[5] );
  threads[2] = atoi ( argv[6] );
  n = atoi ( argv[7] );
/*
  Determine how CUDA would assign the tasks to the blocks and threads.
*/
  cuda_tasks ( blocks, threads, n );

  return 0;
}
/******************************************************************************/

void cuda_tasks ( int blocks[3], int threads[3], int n )

/******************************************************************************/
/*
  Purpose:

    CUDA_TASKS simulates how CUDA assigns tasks to blocks and threads.

  Discussion:

    A CUDA kernel "kernel()" is invoked by a command of the form
      
      kernel << blocks, threads >> ( args )

    where blocks and threads are each vectors of up to 3 values,
    listing the number of blocks and number of threads to be used.

    If a problem involves N tasks, then tasks are allotted to 
    specific CUDA processes in an organized fashion.  Some processes
    may get no tasks, one task, or multiple tasks.  

    Each process is given variables that can be used to determine
    the tasks to be performed:

      gridDim.x, gridDim.y, gridDim.z: the block dimensions as
      given by the user in "blocks";

      blockDim.x, blockDim.y, blockDim.z: the thread dimensions as
      given by the user in "threads";

      blockIdx.x, blockIdx.y, blockId.z: the block indices for this process.

      threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this process.

    Essentially, a process can determine its linear index K by:

      K = threadIdx.x
        +  blockdim.x  * threadIdx.y
        +  blockDim.x  *  blockDim.y  * threadIdx.z
        +  blockDim.x  *  blockDim.y  *  blockDim.z  * blockIdx.x
        +  blockDim.x  *  blockDim.y  *  blockDim.z  *  gridDim.x  * blockIdx.y
        +  blockDim.x  *  blockDim.y  *  blockDim.z  *  gridDim.x  *  gridDim.y  * blockIdx.z

    Set task T = K.

    while ( T < N )
      carry out task T;
      T = T + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    22 March 2017

  Author:

    John Burkardt

  Parameters:

    Input, int BLOCKS[3], the CUDA block values.  These should be nonnegative.
    Typically, the third entry is 1.  Generally, the first two values cannot
    be greater than 35,535.

    Input, int THREADS[3], the CUDA thread values.  These should be nonnegative.
    Typically, there is a maximum value imposed on these quantities, which
    depends on the GPU model.

    Input, int N, the number of tasks to be carried out.
*/
{
  int blockDimx;
  int blockDimy;
  int blockDimz;
  int blockIdx;
  int blockIdy;
  int blockIdz;
  int chunk;
  int gridDimx;
  int gridDimy;
  int gridDimz;
  int k1;
  int k2;
  int t;
  int threadIdx;
  int threadIdy;
  int threadIdz;

  printf ( "\n" );
  printf ( "CUDA_TASKS:\n" );
  printf ( "  Simulate the assignment of N tasks to the blocks\n" );
  printf ( "  and threads of a GPU using CUDA.\n" );
  printf ( "\n" );
  printf ( "  Number of tasks is %d\n", n );
  printf ( "  BLOCKS:  { %d, %d, %d }\n", blocks[0], blocks[1], blocks[2] );
  printf ( "  THREADS: { %d, %d, %d }\n", threads[0], threads[1], threads[2] );

  k1 = 0;

  blockDimx = threads[0];
  blockDimy = threads[1];
  blockDimz = threads[2];

  gridDimx = blocks[0];
  gridDimy = blocks[1];
  gridDimz = blocks[2];

  chunk = blocks[1] * blocks[0] * threads[2] * threads[1] * threads[0];
  printf ( "  Total threads = %d\n", chunk );
  printf ( "\n" );
  printf ( "  Process   Process (bx,by,bz) (tx,ty,tz)  Tasks...\n" );
  printf ( "  Increment Formula\n" );
  printf ( "\n" );

  for ( blockIdz = 0; blockIdz < gridDimz; blockIdz++ )
  {
    for ( blockIdy = 0; blockIdy < gridDimy; blockIdy++ )
    {
      for ( blockIdx = 0; blockIdx < gridDimx; blockIdx++ )
      {
        for ( threadIdz = 0; threadIdz < blockDimz; threadIdz++ )
        {
          for ( threadIdy = 0; threadIdy < blockDimy; threadIdy++ )
          {
            for ( threadIdx = 0; threadIdx < blockDimx; threadIdx++ )
            {
              t = k1;
              k2 = 
                threadIdx
                + blockDimx * threadIdy
                + blockDimx * blockDimy * threadIdz
                + blockDimx * blockDimy * blockDimz * blockIdx
                + blockDimx * blockDimy * blockDimz * gridDimx * blockIdy
                + blockDimx * blockDimy * blockDimz * gridDimx * gridDimy * blockIdz;

              printf ( "  %7d  %7d: (%2d,%2d,%2d) (%2d,%2d,%2d)", 
                k1, k2, blockIdx, blockIdy, blockIdz, threadIdx, threadIdy, threadIdz );
              while ( t < n )
              {
                printf ( "%3d", t ); 
                t = t + chunk;
              }
              printf ( "\n" );
              k1 = k1 + 1;
            }
          }
        }
      }
    }
  }

  return;
}