# include  <stdio.h> 

//
//  Forward declarations.  say_hello is defined after main in this file,
//  so it is declared here to let main launch it.
//
int main ( );
__global__ void say_hello ( );

int main ( ) 
//
//  Purpose:
//
//    MAIN is the host program for HELLO.  It prints a banner, launches
//    the say_hello kernel on a single block of 10 threads, waits for the
//    GPU to finish, and prints a closing message.
//
//  Return value:
//
//    0 on success; 1 if the kernel launch or its execution reported a
//    CUDA error (the error text is written to stderr).
//
{
  int n;
  cudaError_t status;

  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  CUDA version.\n" );
//
// Launch the kernel with 1 block of 10 threads.
// 
  n = 10;
  say_hello <<< 1, n >>> ( ); 
//
//  A kernel launch returns no status of its own.  Launch errors (for
//  example a bad configuration) must be retrieved with cudaGetLastError.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "HELLO - Fatal error!\n" );
    fprintf ( stderr, "  Kernel launch failed: %s\n", 
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  The GPU print statements aren't guaranteed to appear unless you
//  force the CPU to synchronize (wait for) the GPU.  The synchronizing
//  call is also where errors raised during kernel execution surface,
//  so its return value is checked.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "\n" );
    fprintf ( stderr, "HELLO - Fatal error!\n" );
    fprintf ( stderr, "  Kernel execution failed: %s\n", 
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

__global__ void say_hello ( ) 
//
//  Kernel: every thread that runs it prints one greeting line that
//  reports its own block index and thread index (x components only).
//
//  Printing from a CUDA kernel requires a GPU with compute
//  capability of 2.0 or higher.
//
{
//
//  Convert the built-in unsigned indices to int to match the %d format.
//
  const int block_id = ( int ) blockIdx.x;
  const int thread_id = ( int ) threadIdx.x;

  printf ( "Hello from CUDA block %d, thread %d\n", block_id, thread_id );
}