/*
 * Mandelbrot escape-count example: fills a Nre x Nim grid of complex values c
 * on the host, iterates z <- z*z + c for every grid point on the GPU, times
 * the kernel with CUDA events and copies the per-point counts back.
 *
 * NOTE(review): this file reached review with every span between a '<' and
 * the following '>' missing (first #include target, kernel loop body, host
 * fill loop, device allocation/upload, launch configuration, and the tail of
 * the disabled output section). Those spans are reconstructed below and each
 * one is marked NOTE(review); confirm them against the upstream file.
 */
#include <stdio.h>   /* printf, fprintf -- presumably the include whose target was lost */
#include <stdlib.h>  /* calloc, free */
#include "cuda.h"

/*
 * Print the most recent CUDA runtime error, if there is one, to stderr.
 *
 * msg: caller-supplied label included in the report.
 *
 * Uses cudaGetLastError(), so the pending error is consumed by this call.
 */
void checkCUDAError(const char *msg)
{
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "Message: %s posted CUDA error: %s \n", msg, cudaGetErrorString(err));
  }
}

/*
 * cudaMalloc wrapper that warns on failure instead of aborting.
 *
 * d_v:  receives the device pointer.
 * sz:   number of bytes to allocate (size_t so buffers above 2 GiB work).
 * mess: label printed with any warning.
 */
void myCudaMalloc(void **d_v, size_t sz, const char *mess)
{
  const cudaError_t err = cudaMalloc(d_v, sz);
  if (err != cudaSuccess) {
    printf("CUDA Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }
}

/*
 * Blocking device-to-host copy of sz bytes from d_v to h_v; warns on failure.
 */
void myCudaDeviceToHost(void *h_v, void *d_v, size_t sz, const char *mess)
{
  const cudaError_t err = cudaMemcpy(h_v, d_v, sz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    printf("cudaMemcpy Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }
}

/*
 * Blocking host-to-device copy of sz bytes from h_v to d_v; warns on failure.
 */
void myCudaHostToDevice(void *d_v, void *h_v, size_t sz, const char *mess)
{
  const cudaError_t err = cudaMemcpy(d_v, h_v, sz, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    printf("cudaMemcpy Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }
}

/* iteration count and squared-magnitude threshold used by the kernel */
#define Nloops 100
#define escape 4.f

/*
 * CUDA kernel: one thread per grid point, 1D launch.
 *
 * N:     number of points.
 * d_cre: in  - real part of c for each point;
 *        out - overwritten with that point's count (stored as float).
 * d_cim: in  - imaginary part of c for each point.
 *
 * The launch must provide at least N threads in total; surplus threads exit.
 */
__global__ void mandelbrot(const int N, float *d_cre, const float *d_cim)
{
  const int n = blockIdx.x*blockDim.x + threadIdx.x;

  /* was `n>N`, which let thread n==N access one element past the arrays */
  if (n >= N) return;

  const float cre = d_cre[n];
  const float cim = d_cim[n];

  int count = 0;
  float zre = 0.f, zim = 0.f;

  /* NOTE(review): loop body reconstructed. Only `for(loop=0;loop` and the
     trailing `>escape); }` survived; the standard z <- z*z + c update and a
     count of iterations with |z|^2 > escape are assumed -- confirm. */
  for (int loop = 0; loop < Nloops; ++loop) {
    const float zreNext = zre*zre - zim*zim + cre;
    zim = 2.f*zre*zim + cim;
    zre = zreNext;
    count += (zre*zre + zim*zim > escape);
  }

  d_cre[n] = count;
}

/*
 * Create the two timing events and record `tic` on iStream.
 */
void StartKernelTiming(cudaEvent_t& tic, cudaEvent_t& toc, cudaStream_t iStream)
{
  cudaEventCreate(&tic);
  cudaEventCreate(&toc);
  cudaEventRecord(tic, iStream);
}

/*
 * Record `toc` on iStream, wait for it, add the tic->toc time in
 * milliseconds to *ptimer, and destroy both events.
 */
void StopKernelTiming(cudaEvent_t& tic, cudaEvent_t& toc, cudaStream_t iStream, float* ptimer)
{
  float kt = 0.f; /* stays 0 if cudaEventElapsedTime fails, so *ptimer is never fed garbage */
  cudaEventRecord(toc, iStream);
  cudaEventSynchronize(toc);
  cudaEventElapsedTime(&kt, tic, toc);
  cudaEventDestroy(tic);
  cudaEventDestroy(toc);
  (*ptimer) += kt;
}

int main()
{
  const int Nre = 16000;
  const int Nim = 16000;
  const int N = Nre*Nim; /* 256,000,000: still fits in a 32-bit int */

  const int Nthreads = 512;
  const int Nblocks = (N+Nthreads-1)/Nthreads; /* ceiling division */

  /* dimension of block grid */
  dim3 dimBlockGrid(Nblocks);

  /* dimension of thread block */
  dim3 dimThreadBlock(Nthreads);

  /* bytes per array */
  const size_t sz = (size_t)N*sizeof(float);

  /* host arrays (about 1 GB each, so the allocations are checked) */
  float *h_cre    = (float*) calloc(N, sizeof(float));
  float *h_cim    = (float*) calloc(N, sizeof(float));
  float *h_counts = (float*) calloc(N, sizeof(float));
  if (h_cre == NULL || h_cim == NULL || h_counts == NULL) {
    fprintf(stderr, "host allocation of %d floats failed\n", N);
    free(h_cre);
    free(h_cim);
    free(h_counts);
    return 1;
  }

  /* fill up host array */
  int n, m, sk = 0;
  double remin = -2, remax = 2.;
  double immin = -2, immax = 2.;

  /* NOTE(review): loop body reconstructed; only `for(n=0;n` survived.
     A uniform grid over [remin,remax] x [immin,immax] is assumed -- confirm. */
  for (n = 0; n < Nre; ++n) {
    for (m = 0; m < Nim; ++m) {
      h_cre[sk] = (float)(remin + n*(remax-remin)/(Nre-1));
      h_cim[sk] = (float)(immin + m*(immax-immin)/(Nim-1));
      ++sk;
    }
  }

  /* NOTE(review): device allocation and upload reconstructed; the message
     strings are placeholders for the lost originals. */
  float *d_cre = NULL, *d_cim = NULL;
  myCudaMalloc((void**) &d_cre, sz, "allocate d_cre");
  myCudaMalloc((void**) &d_cim, sz, "allocate d_cim");
  myCudaHostToDevice(d_cre, h_cre, sz, "copy cre to device");
  myCudaHostToDevice(d_cim, h_cim, sz, "copy cim to device");

  /* start timer */
  cudaEvent_t tic, toc;
  StartKernelTiming(tic, toc, 0);

  /* NOTE(review): launch configuration reconstructed from the dim3 variables
     declared above; only `>>> ( N, d_cre, d_cim);` survived. */
  mandelbrot <<< dimBlockGrid, dimThreadBlock >>> (N, d_cre, d_cim);

  /* stop timer */
  float elapsed_time = 0.f;
  StopKernelTiming(tic, toc, 0, &elapsed_time);

  /* Checked after the events are synchronized so the check neither sits
     inside the timed region nor misses an error raised by the launch. */
  checkCUDAError("mandelbrot kernel");

  /* convert from milliseconds to seconds */
  elapsed_time /= 1000.f;

  /* output elapsed time */
  printf("elapsed time for kernel: %f\n", elapsed_time);

  myCudaDeviceToHost(h_counts, d_cre, sz, "copy result to host");

#if 0
  /* output */
  /* NOTE(review): the source is cut off after `for(n=0;n`; the dump format
     below is a placeholder -- confirm against the upstream file. */
  FILE *fp = fopen("mandel.dat", "w");
  if (fp != NULL) {
    sk = 0;
    for (n = 0; n < Nre; ++n) {
      for (m = 0; m < Nim; ++m) {
        fprintf(fp, "%g ", h_counts[sk]);
        ++sk;
      }
      fprintf(fp, "\n");
    }
    fclose(fp);
  }
#endif

  /* release device and host memory */
  cudaFree(d_cre);
  cudaFree(d_cim);
  free(h_cre);
  free(h_cim);
  free(h_counts);

  return 0;
}