# include  <stdio.h> 

//
//  Forward declarations: the host entry point and the device kernel,
//  both of which are defined further down in this file.
//
int main ( );
__global__ void sizes ( int n );

int main ( ) 
{
//
//  Purpose:
//
//    MAIN is the host driver for the SIZES example.
//
//  Discussion:
//
//    The host prints the launch configuration and a header naming the
//    columns written by the kernel, launches SIZES on a 2x3x1 grid of
//    2x1x4 thread blocks, and then waits for the device to finish.
//
//    The program returns 0 on success, and 1 if the kernel launch or the
//    device synchronization reports a CUDA error.
//
  cudaError_t err;
  int n; 

  printf ( "\n" );
  printf ( "SIZES:\n" );
  printf ( "  CUDA/C version.\n" );
  printf ( "  Make sense of CUDA block and thread data.\n" );
//
//  Choose the launch configuration and the number of tasks.
//
  dim3 blocks ( 2, 3, 1 );
  dim3 threads ( 2, 1, 4 );
  n = 40;
//
//  The members of a dim3 are unsigned, so they are printed with %u.
//
  printf ( "\n" );
  printf ( "  BLOCKS: ( %u, %u, %u )\n", blocks.x, blocks.y, blocks.z );
  printf ( "  THREADS: ( %u, %u, %u )\n", threads.x, threads.y, threads.z );
  printf ( "  Tasks N: %d\n", n );
//
//  The kernel prints 14 values per thread, beginning with K and CHUNK,
//  so the header names all 14 columns in the same order.
//
  printf ( "\n" );
  printf ( "k, chunk, gridDim.x, gridDim.y, gridDim.z, blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z, threadIdx.x, threadIdx.y, threadIdx.z\n" );
  printf ( "\n" );
//
//  Launch the kernel on the device.
//
  sizes <<< blocks, threads >>> ( n ); 
//
//  A kernel launch returns no status of its own; an invalid launch
//  configuration is only reported through cudaGetLastError.
//
  err = cudaGetLastError ( );
  if ( err != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "SIZES - Fatal error!\n" );
    fprintf ( stderr, "  Kernel launch failed: %s\n", cudaGetErrorString ( err ) );
    return 1;
  }
//
//  This call is required to flush the output printed by the GPU.
//  Its return value also reports errors that occurred while the
//  kernel was running.
//
  err = cudaDeviceSynchronize ( );
  if ( err != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "SIZES - Fatal error!\n" );
    fprintf ( stderr, "  Kernel execution failed: %s\n", cudaGetErrorString ( err ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "SIZES:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

__global__ void sizes ( int n ) 
{
//
//  Purpose:
//
//    SIZES prints, for the calling thread, its linear index within the
//    whole launch, the total number of threads launched, and the raw
//    grid/block/thread dimensions and indices.
//
//  Discussion:
//
//    The linear index K varies fastest in threadIdx.x, then threadIdx.y,
//    threadIdx.z, blockIdx.x, blockIdx.y, and slowest in blockIdx.z.
//
//    CHUNK is the product of all six dimensions, that is, the total
//    number of threads in the launch.
//
//    The argument N is not used yet.  An unfinished sketch left with this
//    routine has each thread start at T = K and repeat some printing
//    while T < N, increasing T by an amount that was never filled in.
//
//  Parameters:
//
//    Input, int N, the number of tasks.  Currently ignored.
//
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int tz = threadIdx.z;

  const int bx = blockIdx.x;
  const int by = blockIdx.y;
  const int bz = blockIdx.z;

  const int block_nx = blockDim.x;
  const int block_ny = blockDim.y;
  const int block_nz = blockDim.z;

  const int grid_nx = gridDim.x;
  const int grid_ny = gridDim.y;
  const int grid_nz = gridDim.z;
//
//  Nested form of the mixed-radix sum
//    tx + U*ty + U*V*tz + U*V*W*bx + U*V*W*X*by + U*V*W*X*Y*bz.
//
  const int block_rank = bx + grid_nx * ( by + grid_ny * bz );
  const int k = tx + block_nx * ( ty + block_ny * ( tz + block_nz * block_rank ) );

  const int threads_per_block = block_nx * block_ny * block_nz;
  const int chunk = threads_per_block * grid_nx * grid_ny * grid_nz;

  printf ( "%d  %d  %d  %d  %d  %d  %d  %d  %d  %d  %d  %d  %d  %d\n", 
    k, chunk,
    grid_nx, grid_ny, grid_nz,
    bx, by, bz,
    block_nx, block_ny, block_nz,
    tx, ty, tz );
}