/*
  Row-sum demonstration.

  The host builds an N x N integer matrix whose entries are 1, 2, ..., N*N
  in row-major order, copies it to the device, launches one block (with one
  thread) per row to compute that row's sum, copies the N sums back, and
  prints them when N < 10.

  Usage:  program N
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/*
  Evaluate a CUDA runtime call and terminate with a diagnostic if it does not
  return cudaSuccess.  Without this, an early failure (e.g. a failed
  cudaMalloc) would only show up later as wrong or missing output.
*/
#define CUDA_CHECK(call)                                                   \
  do                                                                       \
  {                                                                        \
    cudaError_t cuda_check_err_ = (call);                                  \
    if ( cuda_check_err_ != cudaSuccess )                                  \
    {                                                                      \
      fprintf ( stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
                cudaGetErrorString ( cuda_check_err_ ) );                  \
      exit ( EXIT_FAILURE );                                               \
    }                                                                      \
  } while ( 0 )

int main ( int argc, char **argv );
__global__ void findlelt ( int *m, int *rs, int n );

/******************************************************************************/

int main ( int argc, char **argv )

/******************************************************************************/
/*
  Purpose:

    MAIN drives the row-sum computation.

  Parameters:

    Input, int ARGC, the number of command line arguments; must be at least 2.

    Input, char **ARGV, the command line arguments.  ARGV[1] is the matrix
    order N, a positive integer small enough that N*N fits in an int.

    Output, int MAIN, 0 on success, 1 on bad arguments or host allocation
    failure.  A failing CUDA call terminates the program via CUDA_CHECK.
*/
{
  int *dm = NULL;
  int *drs = NULL;
  int *hm = NULL;
  int *hrs = NULL;
  int i;
  int j;
  int k;
  size_t msize;
  int n;
  size_t rssize;

  if ( argc < 2 )
  {
    fprintf ( stderr, "Usage: %s n\n", ( 0 < argc ) ? argv[0] : "rowsum" );
    return 1;
  }

  n = atoi ( argv[1] );
/*
  The matrix entries count up to N*N in an int, so N*N must not overflow.
*/
  if ( n < 1 || INT_MAX / n < n )
  {
    fprintf ( stderr, "Error: n must satisfy 1 <= n and n*n <= %d.\n", INT_MAX );
    return 1;
  }
/*
  Byte counts are computed in size_t so that N*N*sizeof(int) cannot overflow
  an int.
*/
  msize = ( size_t ) n * ( size_t ) n * sizeof ( int );
  hm = ( int * ) malloc ( msize );
  if ( hm == NULL )
  {
    fprintf ( stderr, "Error: could not allocate host matrix.\n" );
    return 1;
  }
/*
  Fill array with data: 1, 2, ..., N*N in row-major order.
*/
  k = 0;
  for ( i = 0; i < n; i++ )
  {
    for ( j = 0; j < n; j++ )
    {
      k = k + 1;
      hm[( size_t ) i * n + j] = k;
    }
  }
/*
  Allocate space for device matrix.
*/
  CUDA_CHECK ( cudaMalloc ( ( void ** ) &dm, msize ) );
/*
  Copy the host matrix to the device matrix.
*/
  CUDA_CHECK ( cudaMemcpy ( dm, hm, msize, cudaMemcpyHostToDevice ) );
/*
  Allocate host and device rowsum arrays.
*/
  rssize = ( size_t ) n * sizeof ( int );
  hrs = ( int * ) malloc ( rssize );
  if ( hrs == NULL )
  {
    fprintf ( stderr, "Error: could not allocate host rowsum array.\n" );
    cudaFree ( dm );
    free ( hm );
    return 1;
  }
  CUDA_CHECK ( cudaMalloc ( ( void ** ) &drs, rssize ) );
/*
  Set up parameters for thread structure: N blocks, 1 thread per block.
*/
  dim3 dimGrid ( n, 1 );
  dim3 dimBlock ( 1, 1, 1 );
/*
  Invoke the kernel.  A launch does not return a status, so launch
  configuration errors are picked up with cudaGetLastError.
*/
  findlelt <<< dimGrid, dimBlock >>> ( dm, drs, n );
  CUDA_CHECK ( cudaGetLastError ( ) );
/*
  Wait for kernel to finish; this also reports errors raised during execution.
*/
  CUDA_CHECK ( cudaDeviceSynchronize ( ) );
/*
  Copy row vector from device to host.
*/
  CUDA_CHECK ( cudaMemcpy ( hrs, drs, rssize, cudaMemcpyDeviceToHost ) );
/*
  Check results (only printed for small N).
*/
  if ( n < 10 )
  {
    for ( i = 0; i < n; i++ )
    {
      printf ( "%d\n", hrs[i] );
    }
  }
/*
  Clean up.
*/
  CUDA_CHECK ( cudaFree ( dm ) );
  CUDA_CHECK ( cudaFree ( drs ) );
  free ( hm );
  free ( hrs );

  return 0;
}
/******************************************************************************/

__global__ void findlelt ( int *m, int *rs, int n )

/******************************************************************************/
/*
  Purpose:

    FINDLELT finds the rowsum of one row of the matrix.

  Discussion:

    Each thread handles the row whose index equals its global 1D thread index
    (blockIdx.x * blockDim.x + threadIdx.x).  With the launch used in MAIN
    (N blocks of 1 thread) this is simply blockIdx.x.  Threads whose index is
    not less than N do nothing, so any 1D launch covering at least N threads
    is valid.

    The sum is accumulated in an int; the caller is responsible for choosing
    data whose row sums fit in an int.

  Parameters:

    Input, int *M, the N x N matrix, stored by rows.

    Output, int *RS, RS[i] receives the sum of row i, for 0 <= i < N.

    Input, int N, the order of the matrix.
*/
{
  int k;
  int rownum;
  int sum;

  rownum = ( int ) ( blockIdx.x * blockDim.x + threadIdx.x );

  if ( n <= rownum )
  {
    return;
  }

  sum = 0;
  for ( k = 0; k < n; k++ )
  {
/*
  Index in size_t so that rownum * n cannot overflow an int.
*/
    sum = sum + m[( size_t ) rownum * n + k];
  }
  rs[rownum] = sum;

  return;
}