# include <stdio.h>

int main ( );
__global__ void say_hello ( );

int main ( )
//
//  Purpose:
//
//    Host entry point: prints a banner, launches the say_hello kernel
//    with 2 blocks of 5 threads each, waits for the device to finish,
//    and prints a closing message.
//
//  Returns:
//
//    0 on normal completion; 1 if the kernel launch or the subsequent
//    device synchronization reports a CUDA error.
//
{
  int blocks;
  int threads;
  cudaError_t status;

  printf ( "\n" );
  printf ( "hello():\n" );
  printf ( " Hello, world from the CUDA GPU.\n" );
  printf ( " CUDA/C version.\n" );
//
//  Run the kernel with 2 blocks and 5 threads each.
//
  blocks = 2;
  threads = 5;
  say_hello <<< blocks, threads >>> ( );
//
//  A kernel launch returns no status of its own; launch failures
//  (for example an invalid configuration) must be queried explicitly.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "hello(): kernel launch failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Flush the output buffer
//  (we have to do this if we print from the GPU).
//  The synchronization also reports errors raised while the kernel ran.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "hello(): device synchronization failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "hello():\n" );
  printf ( " Normal end of execution.\n" );

  return 0;
}

__global__ void say_hello ( )
//
//  Purpose:
//
//    Kernel: every thread prints one line containing its x thread index
//    and its x block index.
//
//  Launch layout:
//
//    Only threadIdx.x and blockIdx.x are read, so the message identifies
//    a thread uniquely only for a one-dimensional grid of one-dimensional
//    blocks.  No shared memory is used and there are no arguments.
//
{
  int i = threadIdx.x;
  int j = blockIdx.x;

  printf ( "Hello from CUDA thread %d, block %d\n", i, j );
}