#include <stdio.h>

__global__ void say_hello ( );

//
//  Report a failed CUDA runtime call.
//
//  Parameters:
//    status - the value returned by a CUDA runtime call.
//    what   - a short description of the step that was attempted.
//
//  Returns 0 if status is cudaSuccess; otherwise prints the CUDA error
//  string to stderr and returns 1.
//
static int cuda_failed ( cudaError_t status, const char *what )
{
  if ( status == cudaSuccess )
  {
    return 0;
  }

  fprintf ( stderr, "\n" );
  fprintf ( stderr, "HELLO - Fatal error!\n" );
  fprintf ( stderr, "  %s: %s\n", what, cudaGetErrorString ( status ) );
  return 1;
}

//
//  Host entry point: prints a banner, launches say_hello with one block
//  of 10 threads, waits for the GPU, and prints a closing banner.
//
//  Returns 0 on success, 1 if the kernel launch or its execution
//  reported a CUDA error.
//
int main ( )
{
  int n;

  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  CUDA version.\n" );
//
//  Launch the kernel with 10 threads.
//
  n = 10;
  say_hello <<< 1, n >>> ( );
//
//  A kernel launch does not return a status itself; launch failures
//  must be fetched explicitly.
//
  if ( cuda_failed ( cudaGetLastError ( ), "Kernel launch failed" ) )
  {
    return 1;
  }
//
//  The GPU print statements aren't guaranteed to appear unless you
//  force the CPU to synchronize (wait for) the GPU.  The returned
//  status also reports errors raised while the kernel was running.
//
  if ( cuda_failed ( cudaDeviceSynchronize ( ), "Kernel execution failed" ) )
  {
    return 1;
  }
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "HELLO:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

//
//  Kernel: each thread prints its own block index and thread index.
//
//  Only the x components of blockIdx and threadIdx are read, so the
//  message identifies a thread uniquely only for a 1D grid of 1D blocks.
//
//  To print from a CUDA kernel requires a GPU with compute
//  capability of 2.0 or higher.
//
__global__ void say_hello ( )
{
  int i = blockIdx.x;
  int j = threadIdx.x;

  printf ( "Hello from CUDA block %d, thread %d\n", i, j );
}