# include <stdio.h>
# include <stdlib.h>

int main ( );
void loadArrays ( float *a_cpu, float *b_cpu, int n );
__global__ void add_vectors ( float *a_gpu, float *b_gpu, float *c_gpu );

int main ( )
{
  float *a_cpu;
  float *a_gpu;
  float *b_cpu;
  float *b_gpu;
  float *c_cpu;
  float *c_gpu;
  int i;
  int memsize;
  int n;

  printf ( "\n" );
  printf ( "VECADD:\n" );
  printf ( "  CUDA/C version.\n" );
  printf ( "  Transfer vectors A and B to GPU, compute sum C.\n" );
//
//  Set the size of the vectors.
//
  n = 10;
//
//  Allocate A and B on the CPU.
//
  memsize = n * sizeof ( float );
  a_cpu = ( float * ) malloc ( memsize );
  b_cpu = ( float * ) malloc ( memsize );
//
//  Load A and B with random floats.
//
  loadArrays ( a_cpu, b_cpu, n );
//
//  Allocate memory on the GPU.
//
  cudaMalloc ( ( void** ) &a_gpu, memsize );
  cudaMalloc ( ( void** ) &b_gpu, memsize );
  cudaMalloc ( ( void** ) &c_gpu, memsize );
//
//  Copy a_gpu <= a_cpu, from host to device.
//
  cudaMemcpy ( a_gpu, a_cpu, memsize, cudaMemcpyHostToDevice );
//
//  Copy b_gpu <= b_cpu, from host to device.
//
  cudaMemcpy ( b_gpu, b_cpu, memsize, cudaMemcpyHostToDevice );
//
//  Launch the kernel on the device, as one block of N threads,
//  so that each thread computes exactly one entry of C.
//
  int blocks = 1;
  int threads = n;

  add_vectors <<< blocks, threads >>> ( a_gpu, b_gpu, c_gpu );
//
//  Copy c_cpu <= c_gpu, from device to host.
//
  c_cpu = ( float * ) malloc ( memsize );
  cudaMemcpy ( c_cpu, c_gpu, memsize, cudaMemcpyDeviceToHost );
//
//  Print the results.
//
  printf ( "\n" );
  printf ( "   i      A[i]      B[i]  C[i]=A[i]+B[i]\n" );
  printf ( "\n" );
  for ( i = 0; i < n; i++ )
  {
    printf ( "  %2d  %8.2f  %8.2f  %8.2f\n", i, a_cpu[i], b_cpu[i], c_cpu[i] );
  }
//
//  Free CPU memory.
//
  free ( a_cpu );
  free ( b_cpu );
  free ( c_cpu );
//
//  Free GPU memory.
//
  cudaFree ( a_gpu );
  cudaFree ( b_gpu );
  cudaFree ( c_gpu );
//
//  Terminate.
//
  printf ( "\n" );
  printf ( "VECADD:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}

void loadArrays ( float *a_cpu, float *b_cpu, int n )

//
//  loadArrays fills the host arrays A and B with random values
//  in the range [0,100], using a fixed seed for repeatability.
//
{
  int i;

  srand ( 1 );

  for ( i = 0; i < n; i++ )
  {
    a_cpu[i] = ( float ) rand ( ) / ( ( float ) RAND_MAX / 100 );
    b_cpu[i] = ( float ) rand ( ) / ( ( float ) RAND_MAX / 100 );
  }

  return;
}

__global__ void add_vectors ( float *a_gpu, float *b_gpu, float *c_gpu )

//
//  This kernel assumes BLOCKS and THREADS are scalars, that is, the
//  grid and each block are one-dimensional, so the global index of a
//  thread is threadIdx.x + blockDim.x * blockIdx.x.  It also assumes
//  the launch supplies exactly one thread per vector entry, so no
//  bounds check is made before indexing the arrays.
//
{
  int k = threadIdx.x + blockDim.x * blockIdx.x;

  c_gpu[k] = a_gpu[k] + b_gpu[k];

  return;
}
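/*
  A note on error checking, not part of the original listing: the CUDA
  calls above are simply assumed to succeed.  A minimal sketch of
  checking them might wrap each runtime call in a macro, as below;
  cudaGetErrorString ( ), cudaGetLastError ( ) and
  cudaDeviceSynchronize ( ) are standard CUDA runtime functions, while
  the macro name HANDLE_CUDA is hypothetical.

    # define HANDLE_CUDA(call) \
      { \
        cudaError_t err = ( call ); \
        if ( err != cudaSuccess ) \
        { \
          fprintf ( stderr, "CUDA error at %s:%d: %s\n", \
            __FILE__, __LINE__, cudaGetErrorString ( err ) ); \
          exit ( 1 ); \
        } \
      }

    HANDLE_CUDA ( cudaMalloc ( ( void** ) &a_gpu, memsize ) );

  A kernel launch does not return a cudaError_t, so after the launch
  one would check it with:

    HANDLE_CUDA ( cudaGetLastError ( ) );
    HANDLE_CUDA ( cudaDeviceSynchronize ( ) );
*/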