# include <stdio.h>
# include <stdlib.h>
# include <cuda_runtime.h>

/*
  VECADD: transfer vectors A and B to the GPU, compute C = A + B on the
  device with one thread per element, copy C back, and print the result.
*/

/*
  Abort with a readable message if a CUDA runtime call fails.  Kernel
  launches do not return an error code directly, so the launch site also
  checks cudaGetLastError().
*/
# define CUDA_CHECK(call)                                                   \
  do                                                                        \
  {                                                                         \
    cudaError_t err_ = (call);                                              \
    if ( err_ != cudaSuccess )                                              \
    {                                                                       \
      fprintf ( stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
        cudaGetErrorString ( err_ ) );                                      \
      exit ( 1 );                                                           \
    }                                                                       \
  } while ( 0 )

int main ( );
void loadArrays ( int n, float *a_cpu, float *b_cpu );
__global__ void add_vectors ( float *a_gpu, float *b_gpu, float *c_gpu );

/*
  Purpose:
    Driver: allocates A, B, C on host and device, fills A and B with
    random values, launches the addition kernel, and prints the sums.

  Returns:
    0 on success; exits nonzero via CUDA_CHECK on any CUDA failure.
*/
int main ( )
{
  float *a_cpu;
  float *a_gpu;
  float *b_cpu;
  float *b_gpu;
  float *c_cpu;
  float *c_gpu;
  int i;
  int memsize;
  int n;

  printf ( "\n" );
  printf ( "VECADD:\n" );
  printf ( "  CUDA/C version.\n" );
  printf ( "  Transfer vectors A and B to GPU, compute sum C.\n" );
/*
  Set size of vectors.
*/
  n = 10;
/*
  Allocate A and B on the CPU.
*/
  memsize = n * sizeof ( float );
  a_cpu = ( float * ) malloc ( memsize );
  b_cpu = ( float * ) malloc ( memsize );
  if ( a_cpu == NULL || b_cpu == NULL )
  {
    fprintf ( stderr, "VECADD: host allocation failed.\n" );
    exit ( 1 );
  }
/*
  Load A and B with random floats.
*/
  loadArrays ( n, a_cpu, b_cpu );
/*
  Allocate memory on the GPU.
*/
  CUDA_CHECK ( cudaMalloc ( ( void** ) &a_gpu, memsize ) );
  CUDA_CHECK ( cudaMalloc ( ( void** ) &b_gpu, memsize ) );
  CUDA_CHECK ( cudaMalloc ( ( void** ) &c_gpu, memsize ) );
/*
  Copy a_gpu <= a_cpu and b_gpu <= b_cpu, from host to device.
*/
  CUDA_CHECK ( cudaMemcpy ( a_gpu, a_cpu, memsize, cudaMemcpyHostToDevice ) );
  CUDA_CHECK ( cudaMemcpy ( b_gpu, b_cpu, memsize, cudaMemcpyHostToDevice ) );
/*
  Launch the kernel on the device: one block of n threads, one thread
  per vector element (add_vectors requires exactly this configuration).
*/
  add_vectors <<< 1, n >>> ( a_gpu, b_gpu, c_gpu );
  CUDA_CHECK ( cudaGetLastError ( ) );
/*
  Copy c_cpu <= c_gpu from device to host.  cudaMemcpy blocks until the
  kernel has finished, so no explicit synchronization is needed here.
*/
  c_cpu = ( float * ) malloc ( memsize );
  if ( c_cpu == NULL )
  {
    fprintf ( stderr, "VECADD: host allocation failed.\n" );
    exit ( 1 );
  }
  CUDA_CHECK ( cudaMemcpy ( c_cpu, c_gpu, memsize, cudaMemcpyDeviceToHost ) );
/*
  Print the results.
*/
  printf ( "\n" );
  printf ( "   i        A[i]         B[i]   C[i]=A[i]+B[i]\n" );
  printf ( "\n" );
  for ( i = 0; i < n && i < 10; i++ )
  {
    printf ( "  %2d  %8.2f  %8.2f  %8.2f\n",
      i, a_cpu[i], b_cpu[i], c_cpu[i] );
  }
/*
  Free CPU memory.
*/
  free ( a_cpu );
  free ( b_cpu );
  free ( c_cpu );
/*
  Free GPU memory.
*/
  CUDA_CHECK ( cudaFree ( a_gpu ) );
  CUDA_CHECK ( cudaFree ( b_gpu ) );
  CUDA_CHECK ( cudaFree ( c_gpu ) );
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "VECADD:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}
/*
  Purpose:
    Fills A and B with pseudorandom floats in [0,100].

  Parameters:
    n     - number of entries to fill in each array.
    a_cpu - host array of at least n floats, overwritten on output.
    b_cpu - host array of at least n floats, overwritten on output.

  Note:
    srand(1) fixes the seed so every run produces the same data,
    which makes the printed output reproducible.
*/
void loadArrays ( int n, float *a_cpu, float *b_cpu )
{
  int i;

  srand ( 1 );

  for ( i = 0; i < n; i++ )
  {
    a_cpu[i] = ( float ) rand ( ) / ( ( float ) RAND_MAX / 100 );
    b_cpu[i] = ( float ) rand ( ) / ( ( float ) RAND_MAX / 100 );
  }

  return;
}
/*
  Purpose:
    Device kernel: each thread computes one entry C[i] = A[i] + B[i].

  Launch configuration:
    Must be launched as <<< 1, n >>> — a single block with exactly one
    thread per element.  The kernel indexes only by threadIdx.x, so a
    multi-block launch, or a block larger than the arrays, would read
    and write out of bounds.

  Parameters:
    a_gpu, b_gpu - device input arrays.
    c_gpu        - device output array, c_gpu[i] = a_gpu[i] + b_gpu[i].
*/
__global__ void add_vectors ( float *a_gpu, float *b_gpu, float *c_gpu )
{
/*
  threadIdx.x is a built-in variable provided by CUDA at runtime.
*/
  int i = threadIdx.x;

  c_gpu[i] = a_gpu[i] + b_gpu[i];

  return;
}