#include <stdio.h>

__global__ void say_hello ( );

//
//  main: host entry point for the HELLO example.
//
//  Prints a banner, launches the say_hello kernel on a 1D grid of
//  2 blocks with 5 threads each, waits for the device to finish, and
//  prints a closing banner.
//
//  Returns 0 on success, or 1 if the kernel launch or its execution
//  reports a CUDA error (the error string is written to stderr).
//
int main ( )
{
  const int blocks = 2;
  const int threads = 5;
  cudaError_t status;

  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Hello, world from the GPU.\n" );
  printf ( "  CUDA/C version.\n" );
//
//  Run the kernel with 2 blocks and 5 threads each.
//
  say_hello <<< blocks, threads >>> ( );
//
//  A kernel launch does not return an error code itself; an invalid
//  launch configuration is only visible through cudaGetLastError.
//
  status = cudaGetLastError ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "HELLO - kernel launch failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Wait for the kernel to finish.  This also flushes the output buffer
//  (we have to do this if we print from the GPU), and it reports any
//  error raised while the kernel was executing.
//
  status = cudaDeviceSynchronize ( );
  if ( status != cudaSuccess )
  {
    fprintf ( stderr, "HELLO - kernel execution failed: %s\n",
      cudaGetErrorString ( status ) );
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

//
//  say_hello: kernel in which every thread prints one greeting line.
//
//  Only the x components of threadIdx, blockIdx and blockDim are read,
//  so the printed indices are meaningful for a 1D launch.
//
//  Each line reports:
//    k, the global thread index threadIdx.x + blockDim.x * blockIdx.x;
//    j, the block index blockIdx.x;
//    i, the thread index within the block, threadIdx.x.
//
__global__ void say_hello ( )
{
  int i = threadIdx.x;
  int j = blockIdx.x;
  int k = threadIdx.x + blockDim.x * blockIdx.x;

  printf ( "Hello #%d from CUDA block %d, thread %d\n", k, j, i );
}