# include  <stdio.h> 

//
//  Forward declarations: main launches the say_hello kernel, whose
//  definition appears after main in this file.
//
int main ( );
__global__ void say_hello ( );

int main ( ) 
//
//  Purpose:
//
//    Prints a banner, launches the say_hello kernel on the GPU, waits for
//    it to finish, and prints a closing message.
//
//  Returns:
//
//    0 if the kernel was launched and completed without a CUDA error,
//    1 otherwise (the CUDA error string is written to stderr).
//
{
  int blocks;
  int threads;
  cudaError_t status;

  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Hello, world from the GPU.\n" );
  printf ( "  CUDA/C version.\n" );
//
//  Run the kernel with 2 blocks and 5 threads each.
// 
  blocks = 2;
  threads = 5;
  say_hello <<< blocks, threads >>> ( ); 
//
//  A kernel launch returns no status of its own; launch failures
//  (invalid configuration, no usable device, ...) must be fetched
//  explicitly with cudaGetLastError.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "HELLO - Fatal error!\n" );
    fprintf ( stderr, "  Kernel launch failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Wait for the kernel to finish.  This also flushes the output buffer
//  (we have to do this if we print from the GPU), and it reports any
//  error that occurred while the kernel was running.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "HELLO - Fatal error!\n" );
    fprintf ( stderr, "  Kernel execution failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

__global__ void say_hello ( ) 
//
//  Kernel: every thread prints one greeting line containing its global
//  index (block index times block size plus thread index), its block
//  index, and its thread index within the block.  Only the x dimension
//  of the launch configuration is used.
//
{
  const int block_id = blockIdx.x;
  const int thread_id = threadIdx.x;
//
//  Global index of this thread across all blocks.
//
  const int global_id = block_id * blockDim.x + thread_id;

  printf ( "Hello #%d from CUDA block %d, thread %d\n",
    global_id, block_id, thread_id );
}