# include <stdio.h> 
# include <stdlib.h>

int main ( );
__global__ void jacobi ( float *b_gpu, float *x1_gpu, float *x2_gpu, int n );

/******************************************************************************/

int main ( ) 

/******************************************************************************/
/*
  Purpose:

    CUDA program for the Jacobi iterative solution of A*x=b.

  Modified:

    29 May 2017

  Author:

    John Burkardt
*/
{
  float *b_cpu;
  float *b_gpu;
  int blocks;
  clock_t clock1;
  clock_t clock2;
  float e;
  float e_norm;
  float *x_cpu;
  float *x_exact;
  float *x1_gpu;
  float *x2_gpu;
  int i;
  int it;
  int memsize;
  int n = 10000; 
  int threads;
  double value;

  printf ( "\n" );
  printf ( "JACOBI:\n" );
  printf ( "  CUDA/C version.\n" );
  printf ( "  Apply Jacobi's iterative method to solve A*x=b\n" );
  printf ( "  System size is N = %d\n", n );
/*
  Choose number of blocks and threads.
  Note that max threads is probably 1024.
*/
  if ( n <= 32 )
  {
    blocks = 1;
    threads = n;
  }
  else
  {
    threads = 32;
    blocks = ( n - 1 ) / threads + 1;
  }
  printf ( "  Blocks = %d, threads = %d, B*T=%d\n", 
    blocks, threads, blocks * threads );
/*
  Set the vector data size in bytes.
*/
  memsize = n * sizeof ( float );
/*
  Set the exact solution X.
*/
  x_exact = ( float * ) malloc ( memsize );
  for ( i = 0; i < n; i++ )
  {
    x_exact[i] = ( float ) rand ( ) / ( float ) RAND_MAX;
  }
/*
  Compute B based on the exact X.
*/
  b_cpu = ( float * ) malloc ( memsize );
  b_cpu[0] = 2.0 * x_exact[0] - x_exact[1];
  for ( i = 1; i < n - 1; i++ )
  {
    b_cpu[i] = - x_exact[i-1] + 2.0 * x_exact[i] - x_exact[i+1];
  }
  b_cpu[n-1] = - x_exact[n-2] + 2.0 * x_exact[n-1];
/*
  Initialize the approximate solution.
*/
  x_cpu = ( float * ) malloc ( memsize );
  for ( i = 0; i < n; i++ )
  {
    x_cpu[i] = 0.0;
  }
/*
  Allocate B, X1, X2 on the GPU.
*/
  cudaMalloc ( ( void** ) &b_gpu, memsize ); 
  cudaMalloc ( ( void** ) &x1_gpu, memsize );
  cudaMalloc ( ( void** ) &x2_gpu, memsize ); 
/*
  Copy b_gpu <= b_cpu, from host to device.
  Copy x1_gpu <= x_cpu, from host to device.
  Copy x2_gpu <= x_cpu, from host to device.
*/
  cudaMemcpy ( b_gpu, b_cpu, memsize, cudaMemcpyHostToDevice );
  cudaMemcpy ( x1_gpu, x_cpu, memsize, cudaMemcpyHostToDevice );
/*
  Launch the kernel on the device.
  Use X1 to update X2, then X2 to update X1.
  On return, latest solution estimate is in X1.
*/
  clock1 = clock ( );

  for ( it = 0; it < 10 * n; it++ )
  {
    jacobi <<< threads, blocks >>> ( b_gpu, x1_gpu, x2_gpu, n ); 
    jacobi <<< threads, blocks >>> ( b_gpu, x2_gpu, x1_gpu, n ); 
  }
/*
  Copy x_cpu <= x1_gpu from device to host.
*/
  cudaMemcpy ( x_cpu, x1_gpu, memsize, cudaMemcpyDeviceToHost ); 

  clock2 = clock ( );
  value = ( double ) ( clock2 - clock1 ) / ( double ) CLOCKS_PER_SEC;
/*
  Compute the residual norm.
*/
  e_norm = 0.0;
  for ( i = 0; i < n; i++ )
  {
    e = 2.0 * x_cpu[i] - b_cpu[i];
    if ( 0 < i )
    {
      e = e - x_cpu[i-1];
    }
    if ( i < n - 1 )
    {
      e = e - x_cpu[i+1];
    }
    e_norm = e_norm + e * e;
  }
  e_norm = sqrt ( e_norm / ( float ) ( n ) );
  printf ( "\n" );
  printf ( "  RMS error ||A*x-b|| = %g\n", e_norm );
/*
  Compute the approximation error norm.
*/
  e_norm = 0.0;
  for ( i = 0; i < n; i++ )
  {
    e_norm = e_norm + pow ( x_exact[i] - x_cpu[i], 2 );
  }
  e_norm = sqrt ( e_norm / ( float ) ( n ) );
  printf ( "  RMS error ||x(exact) - x(computed)|| = %g\n", e_norm );
  printf ( "  Computation required %g seconds\n", value );
/*
  Print no more than 10 entries of the data.
*/
  printf ( "\n" );
  printf ( "     I    B[i]    X[i]    Xexact[i]  (Xexact-X)[i]  (A*X-B)[I]\n" );
  printf ( "\n" );

  for ( i = 0; i < n && i < 10; i++ )
  {
    e = 2.0 * x_cpu[i] - b_cpu[i];
    if ( 0 < i )
    {
      e = e - x_cpu[i-1];
    }
    if ( i < n - 1 )
    {
      e = e - x_cpu[i+1];
    }
    printf ( "  %4d  %8.4f  %8.4f  %8.4f  %8.2e  %8.2e\n", 
      i, b_cpu[i], x_cpu[i], x_exact[i], x_exact[i] - x_cpu[i], e );
  }
/*
  Free CPU memory.
*/
  free ( b_cpu );
  free ( x_cpu );
  free ( x_exact );
/* 
  Free GPU memory.
*/
  cudaFree ( b_gpu ); 
  cudaFree ( x1_gpu ); 
  cudaFree ( x2_gpu );
/*
  Reset GPU.
*/
  cudaDeviceReset ( );
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "JACOBI:\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}
/******************************************************************************/

__global__ void jacobi ( float *b_gpu, float *x1_gpu, float *x2_gpu, int n ) 

/******************************************************************************/
{
/*
  Purpose:

    JACOBI carries out one thread of one step of Jacobi iteration.

  Discussion:

    Specifically, for 0 < i < N - 1
      - x(i-1) + 2 x(i) - x(i+1) = b(i) 
    is rewritten
      x2(i) = ( b(i) + x(i-1) + x(i+1) ) / 2
    for i = 0, we have
      x2(i) = ( b(i) + x(i+1) ) / 2
    and for i = n - 1 we have
      x2(i) = ( b(i) + x(i-1) ) / 2

  Modified:

    29 May 2017

  Author:

    John Burkardt
*/
  int k = threadIdx.x + blockDim.x * blockIdx.x;
  float xim1;
  float xip1;

  if ( k < n )
  {
    if ( 0 < k )
    {
      xim1 = x1_gpu[k-1];
    }
    else
    {
      xim1 = 0.0;
    }
    if ( k < n - 1 )
    { 
      xip1 = x1_gpu[k+1];
    }
    else
    {
      xip1 = 0.0;
    }
    x2_gpu[k] = 0.5 * ( b_gpu[k] + xim1 + xip1 );
  }

  return;
}