# include <stdio.h>
# include <stdlib.h>

int main ( );
__global__ void laplace ( float *t1_gpu, float *t2_gpu, int m, int n );
__device__ float bc_gpu ( float x, float y );
__device__ float rhs_gpu ( float x, float y );

/******************************************************************************/

int main ( )

/******************************************************************************/
/*
  Purpose:

    laplace_solve() is a CUDA program for the iterative solution of the
    Laplace equation.

  Discussion:

    The region is the rectangle [0.0,2.0] x [0.0,1.0].

    A grid of 41 points in the X direction and 21 points in the Y direction
    is used.

    If the temperatures are stored in a 21x41 array T, then the shape of the
    array will correspond exactly to the shape of the physical grid if we
    set, for i = 0...m-1, j = 0...n-1:

      x(j) = 2 * j / ( n - 1 )
      y(i) = ( m - 1 - i ) / ( m - 1 )

    We want to solve the discretized heat equation:

      ( 4*t(i,j) - t(i-1,j) - t(i+1,j) - t(i,j-1) - t(i,j+1) ) / delta^2 = f(x,y)

    where delta is the uniform grid spacing, with boundary conditions

      t(0.0,  * ) =  10.0
      t(  *, 0.0) =   0.0
      t(  *, 1.0) =   0.0
      t(2.0,  * ) = 100.0

    The Jacobi iteration is repeated 100 times; each repetition carries out
    two sweeps (T1 updates T2, then T2 updates T1), so the latest estimate
    ends up in T1.

  Modified:

    12 March 2017
*/
{
  int i;
  int j;
  int it;
  int m;
  int memsize;
  int n;
  float *t_cpu;
  float *t1_gpu;
  float *t2_gpu;
  float x;
  float y;

  printf ( "\n" );
  printf ( "laplace_solve():\n" );
  printf ( " CUDA/C version.\n" );
  printf ( " Solve the Laplace equation on an MxN grid.\n" );
/*
  Set the grid dimensions.
*/
  m = 21;
  n = 41;
/*
  Allocate T on the CPU.
*/
  memsize = m * n * sizeof ( float );
  t_cpu = ( float * ) malloc ( memsize );
/*
  Initialize T.
*/
  for ( j = 0; j < n; j++ )
  {
    for ( i = 0; i < m; i++ )
    {
      t_cpu[i+j*m] = 0.0;
    }
  }
/*
  Allocate T1, T2 on the GPU.
*/
  cudaMalloc ( ( void** ) &t1_gpu, memsize );
  cudaMalloc ( ( void** ) &t2_gpu, memsize );
/*
  Copy t1_gpu <= t_cpu, from host to device.
*/
  cudaMemcpy ( t1_gpu, t_cpu, memsize, cudaMemcpyHostToDevice );
/*
  Launch the kernel on the device, using a single block of MxN threads,
  one thread per grid point.

  Use T1 to update T2, then T2 to update T1.
  On return, the latest solution estimate is in T1.
*/
  dim3 thread_grid ( m, n, 1 );

  for ( it = 0; it < 100; it++ )
  {
    laplace <<< 1, thread_grid >>> ( t1_gpu, t2_gpu, m, n );
    laplace <<< 1, thread_grid >>> ( t2_gpu, t1_gpu, m, n );
  }
/*
  Copy t_cpu <= t1_gpu, from device to host.
*/
  cudaMemcpy ( t_cpu, t1_gpu, memsize, cudaMemcpyDeviceToHost );
/*
  Print the solution.
*/
  for ( j = 0; j < n; j++ )
  {
    x = 0.0 + 2.0 * ( float ) ( j ) / ( float ) ( n - 1 );
    for ( i = 0; i < m; i++ )
    {
      y = 0.0 + 1.0 * ( float ) ( m - 1 - i ) / ( float ) ( m - 1 );
      printf ( " %8.4f %8.4f %14.6g\n", x, y, t_cpu[i+j*m] );
/*
      printf ( " %3d %3d %8.4f %8.4f %14.6g\n", i, j, x, y, t_cpu[i+j*m] );
*/
    }
    printf ( "\n" );
  }
/*
  Free CPU memory.
*/
  free ( t_cpu );
/*
  Free GPU memory.
*/
  cudaFree ( t1_gpu );
  cudaFree ( t2_gpu );
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "laplace_solve():\n" );
  printf ( " Normal end of execution.\n" );

  return 0;
}
/******************************************************************************/

__global__ void laplace ( float *t1_gpu, float *t2_gpu, int m, int n )

/******************************************************************************/
/*
  Purpose:

    laplace() carries out, for one thread, one step of the Jacobi iteration
    for Laplace's equation.
  Discussion:

    The region is the rectangle [0.0,2.0] x [0.0,1.0].

  Modified:

    12 March 2017
*/
{
  float delta;
  int i = threadIdx.x;
  int j = threadIdx.y;
  float tim1;
  float tip1;
  float tjm1;
  float tjp1;
  float x;
  float y;

  x = 0.0 + 2.0 * ( float ) ( j ) / ( float ) ( n - 1 );
  y = 0.0 + 1.0 * ( float ) ( m - 1 - i ) / ( float ) ( m - 1 );
/*
  Boundary points take the boundary value; interior points take the Jacobi
  average of their four neighbors plus the right hand side term.
*/
  if ( i == 0 || i == m - 1 || j == 0 || j == n - 1 )
  {
    t2_gpu[i+j*m] = bc_gpu ( x, y );
  }
  else
  {
    tim1 = t1_gpu[i-1+j*m];
    tip1 = t1_gpu[i+1+j*m];
    tjm1 = t1_gpu[i+(j-1)*m];
    tjp1 = t1_gpu[i+(j+1)*m];
    delta = 1.0 / ( float ) ( m - 1 );
    t2_gpu[i+j*m] = 0.25 *
      ( rhs_gpu ( x, y ) * delta * delta + tim1 + tip1 + tjm1 + tjp1 );
  }

  return;
}
/******************************************************************************/

__device__ float bc_gpu ( float x, float y )

/******************************************************************************/
/*
  Purpose:

    bc_gpu() evaluates the boundary condition at the point (x,y).
*/
{
  float eps = 0.000001;
  float value;

  if ( fabs ( y ) <= eps || fabs ( y - 1.0 ) <= eps )
  {
    value = 0.0;
  }
  else if ( fabs ( x ) <= eps )
  {
    value = 10.0;
  }
  else if ( fabs ( x - 2.0 ) <= eps )
  {
    value = 100.0;
  }
  else
  {
    value = 0.0;
  }

  return value;
}
/******************************************************************************/

__device__ float rhs_gpu ( float x, float y )

/******************************************************************************/
/*
  Purpose:

    rhs_gpu() evaluates the right hand side f(x,y), which is zero for the
    Laplace equation.
*/
{
  float value = 0.0;

  return value;
}
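/*
  Usage sketch (not part of the original source; the file name and output
  file are assumptions): with the NVIDIA CUDA toolkit installed, a program
  like this one is typically compiled and run as

    nvcc -o laplace_solve laplace_solve.cu
    ./laplace_solve > laplace_solve.txt

  which writes the grid coordinates and the temperature estimate at each
  point after the Jacobi sweeps.
*/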