#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"

/* Report the most recent CUDA runtime error, if there is one.
 *
 * msg: caller-supplied label that is echoed in front of the CUDA error text.
 *
 * Queries cudaGetLastError() (which also resets the stored error state) and,
 * when the result is anything other than cudaSuccess, writes a one-line
 * diagnostic to stderr.  Nothing is printed on success, and the function
 * never aborts the program.
 */
void checkCUDAError(const char *msg)
{
  const cudaError_t lastError = cudaGetLastError();

  /* nothing to report */
  if (lastError == cudaSuccess)
    return;

  fprintf(stderr, "Message: %s posted CUDA error: %s \n",
          msg, cudaGetErrorString(lastError));
}

/* Allocate device memory and warn (without aborting) on failure.
 *
 * d_v : address of the pointer that receives the device allocation.
 * sz  : number of bytes to allocate.
 * mess: label printed in the warning; const so string literals can be
 *       passed without a deprecated/ill-formed char* conversion.
 *
 * On failure a warning with the numeric error code goes to stdout and the
 * CUDA error string is reported through checkCUDAError().  The function
 * returns normally either way, so callers must not assume *d_v is usable
 * after a warning.
 */
void myCudaMalloc(void **d_v, int sz, const char *mess){

  const cudaError_t err = cudaMalloc(d_v, sz);

  if(err!=cudaSuccess){
    printf("CUDA Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }

}

/* Blocking copy of sz bytes from device memory to host memory.
 *
 * h_v : destination buffer on the host.
 * d_v : source buffer on the device.
 * sz  : number of bytes to copy.
 * mess: label printed in the warning; const so string literals can be
 *       passed without a deprecated/ill-formed char* conversion.
 *
 * On failure a warning with the numeric error code goes to stdout and the
 * CUDA error string is reported through checkCUDAError().  The function
 * does not abort; the contents of h_v should be treated as invalid after
 * a warning.
 */
void myCudaDeviceToHost(void *h_v, void *d_v, int sz, const char *mess){

  const cudaError_t err = cudaMemcpy(h_v, d_v, sz, cudaMemcpyDeviceToHost);

  if(err!=cudaSuccess){
    printf("cudaMemcpy Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }

}

/* Blocking copy of sz bytes from host memory to device memory.
 *
 * d_v : destination buffer on the device.
 * h_v : source buffer on the host.
 * sz  : number of bytes to copy.
 * mess: label printed in the warning; const so string literals can be
 *       passed without a deprecated/ill-formed char* conversion.
 *
 * On failure a warning with the numeric error code goes to stdout and the
 * CUDA error string is reported through checkCUDAError().  The function
 * does not abort.
 */
void myCudaHostToDevice(void *d_v, void *h_v, int sz, const char *mess){

  const cudaError_t err = cudaMemcpy(d_v, h_v, sz, cudaMemcpyHostToDevice);

  if(err!=cudaSuccess){
    printf("cudaMemcpy Warning: %s gives code %d\n", mess, (int)err);
    checkCUDAError(mess);
  }

}


/* CUDA kernel: Mandelbrot escape counting, one point per thread.
 *
 * N     : number of points (valid indices are 0 .. N-1).
 * d_cre : in  - real part of c for each point;
 *         out - overwritten with the per-point count (stored as float).
 * d_cim : in  - imaginary part of c for each point (read-only).
 *
 * Launch layout: 1D grid of 1D blocks with gridDim.x*blockDim.x >= N; each
 * thread handles the single index n = blockIdx.x*blockDim.x + threadIdx.x.
 * No shared memory is used.
 *
 * For each point the iteration z <- z*z + c is run exactly Nloops times
 * starting from z = 0 (there is no early exit).  The result is the number
 * of iterations at which |z|^2 exceeded `escape`, not the index of the
 * first escaping iteration.
 * NOTE(review): once z overflows to inf/NaN the comparison can evaluate to
 * false, so the count may stop increasing for strongly divergent points --
 * confirm this is the intended metric.
 */
__global__ void mandelbrot(const int N, float *d_cre, const float *d_cim){
  
#define Nloops 100
#define escape 4.f

  const int n = blockIdx.x*blockDim.x + threadIdx.x;
  int loop;

  /* Threads past the end of the data do nothing.  The test must be >=:
     index N itself is one element beyond the arrays. */
  if(n>=N)return;

  const float cre = d_cre[n];
  const float cim = d_cim[n];

  int count = 0;
 
  float zre = 0.f, zim = 0.f;
  for(loop=0;loop<Nloops;++loop){
  
    /* z*z + c, split into real and imaginary parts */
    const float new_zre =    zre*zre - zim*zim + cre;
    const float new_zim =          2.f*zre*zim + cim;
    
    zre = new_zre;
    zim = new_zim;

    /* branch-free accumulate: the comparison contributes 0 or 1 */
    count = count + (zre*zre+zim*zim>escape);

  }
 
  /* result reuses the real-part array as output */
  d_cre[n] = count;
}

/* Create a pair of CUDA events and record the start event.
 *
 * tic     : receives the newly created start event; recorded on iStream.
 * toc     : receives the newly created stop event (not recorded here).
 * iStream : stream on which the start event is recorded.
 *
 * All three runtime calls are always attempted; any call that fails is
 * reported on stderr instead of being silently ignored.  The function does
 * not abort.  The events are expected to be released by the matching
 * StopKernelTiming() call.
 */
void StartKernelTiming(cudaEvent_t& tic, cudaEvent_t& toc, cudaStream_t iStream)
{
  cudaError_t err;

  err = cudaEventCreate(&tic);
  if(err != cudaSuccess)
    fprintf(stderr, "StartKernelTiming: cudaEventCreate(tic) failed: %s\n",
            cudaGetErrorString(err));

  err = cudaEventCreate(&toc);
  if(err != cudaSuccess)
    fprintf(stderr, "StartKernelTiming: cudaEventCreate(toc) failed: %s\n",
            cudaGetErrorString(err));

  err = cudaEventRecord(tic, iStream);
  if(err != cudaSuccess)
    fprintf(stderr, "StartKernelTiming: cudaEventRecord(tic) failed: %s\n",
            cudaGetErrorString(err));
}

/* Record the stop event, wait for it, and accumulate the elapsed time.
 *
 * tic     : start event previously created and recorded.
 * toc     : stop event previously created; recorded here on iStream.
 * iStream : stream on which the stop event is recorded.
 * ptimer  : accumulator; the tic->toc interval in milliseconds (the unit
 *           reported by cudaEventElapsedTime) is ADDED to *ptimer.
 *
 * Both events are destroyed before returning, whether or not the
 * measurement succeeded.  If any step fails, *ptimer is left untouched
 * (rather than being incremented by an uninitialized value) and the CUDA
 * error string is written to stderr.
 */
void StopKernelTiming(cudaEvent_t& tic, cudaEvent_t& toc, cudaStream_t iStream, float* ptimer)
{
  float kt = 0.f;

  cudaError_t err = cudaEventRecord(toc, iStream);
  /* block the host until everything queued before toc has finished */
  if(err == cudaSuccess) err = cudaEventSynchronize(toc);
  if(err == cudaSuccess) err = cudaEventElapsedTime(&kt, tic, toc);

  cudaEventDestroy(tic); cudaEventDestroy(toc);

  if(err == cudaSuccess)
    (*ptimer) += kt;
  else
    fprintf(stderr, "StopKernelTiming: CUDA error: %s\n",
            cudaGetErrorString(err));
}


/* Host driver: build an Nre x Nim grid of complex points on [-2,2]x[-2,2],
 * run the mandelbrot kernel over it on the device, time the kernel with
 * CUDA events, and copy the per-point counts back to the host.
 *
 * Returns 0 on normal completion and EXIT_FAILURE if a host or device
 * buffer could not be allocated.
 */
int main(){

  int Nre = 16000;
  int Nim = 16000;

  /* NOTE: N and sz below are plain ints.  With these sizes the byte count
     (about 1.02e9) still fits, but a noticeably larger grid would overflow. */
  int N = Nre*Nim;
    
  int Nthreads = 512;
  /* ceiling division so that Nblocks*Nthreads >= N */
  int Nblocks = (N+Nthreads-1)/Nthreads;

  /* dimension of block grid */
  dim3 dimBlockGrid(Nblocks);

  /* dimension of thread block */
  dim3 dimThreadBlock(Nthreads);

  /* host arrays (inputs and result); each is N floats, so check them */
  float *h_cre    = (float*) calloc(N, sizeof(float));
  float *h_cim    = (float*) calloc(N, sizeof(float));
  float *h_counts = (float*) calloc(N, sizeof(float));
  if(h_cre==NULL || h_cim==NULL || h_counts==NULL){
    fprintf(stderr, "host allocation of %d floats per array failed\n", N);
    free(h_cre); free(h_cim); free(h_counts);   /* free(NULL) is a no-op */
    return EXIT_FAILURE;
  }

  /* fill up host array: n walks the real axis, m the imaginary axis */
  int n,m,sk=0;
  double remin = -2, remax = 2.;
  double immin = -2, immax = 2.;
  for(n=0;n<Nre;++n){
    for(m=0;m<Nim;++m){
      h_cre[sk] = remin + (remax-remin)*n/(Nre-1.);
      h_cim[sk] = immin + (immax-immin)*m/(Nim-1.);
      ++sk;
    }
  }

  /* device arrays; start as NULL so a failed allocation can be detected */
  float *d_cre = NULL;
  float *d_cim = NULL;

  /* allocate device arrays */
  int sz = N*sizeof(float);
  myCudaMalloc( (void**) &d_cre, sz, "allocate d_cre");
  myCudaMalloc( (void**) &d_cim, sz, "allocate d_cim");

  /* myCudaMalloc only warns on failure.  This check assumes a failed
     cudaMalloc leaves the pointer at its initial NULL -- TODO confirm. */
  if(d_cre==NULL || d_cim==NULL){
    fprintf(stderr, "device allocation failed; aborting\n");
    cudaFree(d_cre); cudaFree(d_cim);
    free(h_cre); free(h_cim); free(h_counts);
    return EXIT_FAILURE;
  }

  myCudaHostToDevice( d_cre, h_cre, sz, "copy cre to device");
  myCudaHostToDevice( d_cim, h_cim, sz, "copy cim to device");

  /* start timer */
  cudaEvent_t tic, toc;
  StartKernelTiming(tic, toc, 0);

  /* request to invoke kernel */
  mandelbrot <<< dimBlockGrid, dimThreadBlock >>> ( N, d_cre, d_cim);

  /* a bad launch configuration is only visible through the last-error
     state; this query does not wait for the kernel to finish */
  checkCUDAError("mandelbrot kernel launch");

  /* stop timer */
  float elapsed_time = 0.f;
  StopKernelTiming(tic,toc, 0, &elapsed_time);

  /* convert from milliseconds to seconds */
  elapsed_time /= 1000.f;

  /* output elapsed time */
  printf("elapsed time for kernel: %f\n", elapsed_time);

  /* the kernel wrote its counts over d_cre */
  myCudaDeviceToHost( h_counts, d_cre, sz, "copy result to host");

#if 0
  /* output */
  FILE *fp = fopen("mandel.dat", "w");
  if(fp!=NULL){
    sk = 0;
    for(n=0;n<Nre;++n){
      for(m=0;m<Nim;++m){
        fprintf(fp, "%g %g %d\n", h_cre[sk], h_cim[sk], (int)h_counts[sk]);
        ++sk;
      }
    }
    fclose(fp);
  }
#endif

  /* release device and host buffers */
  cudaFree(d_cre);
  cudaFree(d_cim);
  free(h_cre);
  free(h_cim);
  free(h_counts);

  return 0;
}