# include <stdlib.h>
# include <stdio.h>

# include "cuda_loop.h"

int main ( );

/******************************************************************************/

int main ( )

/******************************************************************************/
/*
  Purpose:

    CUDA_LOOP_TEST demonstrates CUDA_LOOP.

  Discussion:

    A CUDA kernel "kernel()" is invoked by a command of the form

      kernel <<< blocks, threads >>> ( args )

    where blocks and threads are each vectors of up to 3 values, listing
    the number of blocks and the number of threads to be used.

    If a problem involves N tasks, then tasks are allotted to specific CUDA
    processes in an organized fashion.  Some processes may get no tasks,
    one task, or multiple tasks.

    Each process is given variables that can be used to determine the tasks
    to be performed:

    * gridDim.x, gridDim.y, gridDim.z: the block dimensions as given by the
      user in "blocks";

    * blockDim.x, blockDim.y, blockDim.z: the thread dimensions as given by
      the user in "threads";

    * blockIdx.x, blockIdx.y, blockIdx.z: the block indices for this process;

    * threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this
      process.

    Essentially, a process can determine its linear index K by:

      K = threadIdx.x
        + blockDim.x * threadIdx.y
        + blockDim.x * blockDim.y * threadIdx.z
        + blockDim.x * blockDim.y * blockDim.z * blockIdx.x
        + blockDim.x * blockDim.y * blockDim.z * gridDim.x * blockIdx.y
        + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * blockIdx.z

    Set task T = K.

    while ( T < N )
      carry out task T;
      T = T + blockDim.x * blockDim.y * blockDim.z
            * gridDim.x * gridDim.y * gridDim.z.

    This program suggests how a specific set of block and thread parameters
    would determine the assignment of individual tasks to CUDA processes.

  Licensing:

    This code is distributed under the MIT license.

  Modified:

    22 March 2017

  Author:

    John Burkardt

  Parameters:

    Local, int BLOCKS[3], the CUDA block values.  These should be nonnegative.
    Typically, the third entry is 1.  Generally, the first two values cannot
    be greater than 65,535.

    Local, int THREADS[3], the CUDA thread values.  These should be
    nonnegative.  Typically, there is a maximum value imposed on these
    quantities, which depends on the GPU model.

    Local, int N, the number of tasks to be carried out.
*/
{
  int blocks[3];
  int n;
  int threads[3];

  timestamp ( );
  printf ( "\n" );
  printf ( "CUDA_LOOP_TEST:\n" );
  printf ( "  C version\n" );
  printf ( "  Simulate the way CUDA breaks a problem into iterative tasks,\n" );
  printf ( "  using blocks and threads.\n" );
/*
  Linear array of blocks and threads.
  Essentially, blocks = your hands and threads = your fingers.
  Now count up to 23.
*/
  blocks[0] = 2;
  blocks[1] = 1;
  blocks[2] = 1;
  threads[0] = 5;
  threads[1] = 1;
  threads[2] = 1;
  n = 23;
  cuda_loop ( blocks, threads, n );
/*
  Unit arrays of blocks and threads.
  Waste your GPU by having a single block and thread do everything.
*/
  blocks[0] = 1;
  blocks[1] = 1;
  blocks[2] = 1;
  threads[0] = 1;
  threads[1] = 1;
  threads[2] = 1;
  n = 23;
  cuda_loop ( blocks, threads, n );
/*
  2D block array, 3D thread array.
  More processes than tasks.
*/
  blocks[0] = 2;
  blocks[1] = 3;
  blocks[2] = 1;
  threads[0] = 2;
  threads[1] = 1;
  threads[2] = 4;
  n = 40;
  cuda_loop ( blocks, threads, n );
/*
  One block, 8 threads.
*/
  blocks[0] = 1;
  blocks[1] = 1;
  blocks[2] = 1;
  threads[0] = 2;
  threads[1] = 2;
  threads[2] = 2;
  n = 23;
  cuda_loop ( blocks, threads, n );
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "CUDA_LOOP_TEST:\n" );
  printf ( "  Normal end of execution.\n" );
  printf ( "\n" );
  timestamp ( );

  return 0;
}
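/******************************************************************************/
/*
  CUDA_LOOP_SKETCH: an illustrative sketch, not part of the original test and
  not necessarily how cuda_loop() in cuda_loop.c is actually written.  It is
  one plausible way to turn the indexing formula described above into a serial
  simulation: every (block, thread) pair computes its linear index K and then
  handles tasks K, K + P, K + 2*P, ..., where P is the total number of CUDA
  processes.  The name cuda_loop_sketch and its printed output are assumptions
  made here for illustration only.
*/
void cuda_loop_sketch ( int blocks[3], int threads[3], int n )
{
  int bx, by, bz;
  int tx, ty, tz;
  int k;
  int t;
/*
  P = total number of CUDA processes = threads per block * number of blocks.
*/
  int p = threads[0] * threads[1] * threads[2]
        * blocks[0]  * blocks[1]  * blocks[2];

  for ( bz = 0; bz < blocks[2]; bz++ )
    for ( by = 0; by < blocks[1]; by++ )
      for ( bx = 0; bx < blocks[0]; bx++ )
        for ( tz = 0; tz < threads[2]; tz++ )
          for ( ty = 0; ty < threads[1]; ty++ )
            for ( tx = 0; tx < threads[0]; tx++ )
            {
/*
  Linear index of this process, following the formula in the header comment,
  with threads[] playing the role of blockDim and blocks[] that of gridDim.
*/
              k = tx
                + threads[0] * ty
                + threads[0] * threads[1] * tz
                + threads[0] * threads[1] * threads[2] * bx
                + threads[0] * threads[1] * threads[2] * blocks[0] * by
                + threads[0] * threads[1] * threads[2] * blocks[0] * blocks[1] * bz;
/*
  Process K handles tasks K, K+P, K+2*P, ... that are less than N.
*/
              for ( t = k; t < n; t = t + p )
              {
                printf ( "  Process %d carries out task %d\n", k, t );
              }
            }

  return;
}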