# include <stdio.h> 

int main ( );
__global__ void say_hello ( );

//
//  main() is the host entry point for the "hello" example.
//
//  It prints a banner, launches the say_hello kernel on a 1D grid of
//  2 blocks with 5 threads each, waits for the device to finish so that
//  the text printed by the kernel is delivered, and prints a closing message.
//
//  Returns 0 on success, or 1 if the kernel launch or the device
//  synchronization reports a CUDA error (the error text goes to stderr).
//
int main ( ) 
{
  const int blocks = 2;
  const int threads = 5;
  cudaError_t status;

  printf ( "\n" );
  printf ( "hello():\n" );
  printf ( "  Hello, world from the CUDA GPU.\n" );
  printf ( "  CUDA/C version.\n" );
//
//  Run the kernel with 2 blocks and 5 threads each.
//
  say_hello <<< blocks, threads >>> ( );
//
//  A kernel launch returns no status of its own; launch failures
//  (bad configuration, no usable device) must be fetched explicitly.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "hello(): Fatal error!\n" );
    fprintf ( stderr, "  Kernel launch failed: %s\n", 
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Wait for the kernel to complete.  This also flushes the output buffer
//  (we have to do this if we print from the GPU), and it is where errors
//  raised while the kernel was executing are reported.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "hello(): Fatal error!\n" );
    fprintf ( stderr, "  Device synchronization failed: %s\n", 
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "hello():\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

//
//  say_hello() is a kernel in which every thread prints one greeting line.
//
//  Each thread reports the x component of its thread index and the
//  x component of its block index; other dimensions are not examined.
//  The kernel takes no arguments and writes no memory.
//
__global__ void say_hello ( ) 
{
  const int thread_id = threadIdx.x;
  const int block_id = blockIdx.x;

  printf ( "Hello from CUDA thread %d, block %d\n", thread_id, block_id );
}