Actual source code: ex1.raja.cxx

  1: //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
  2: // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
  3: // and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
  4: //
  5: // SPDX-License-Identifier: (BSD-3-Clause)
  6: //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

  8: #include <cstdlib>
  9: #include <cstdio>
 10: #include <cstring>

 12: #include <iostream>
 13: #include <cmath>

 15: #include "RAJA/RAJA.hpp"

 17: #include "memoryManager.hpp"

 19: /*
 20:  * Jacobi Example
 21:  *
 22:  * ----[Details]--------------------
 23:  * This code uses a five point finite difference stencil
 24:  * to discretize the following boundary value problem
 25:  *
 26:  * U_xx + U_yy = f on [0,1] x [0,1].
 27:  *
 28:  * The right-hand side is chosen to be
 29:  * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
 30:  *
 31:  * A structured grid is used to discretize the domain
 32:  * [0,1] x [0,1]. Values inside the domain are computed
 33:  * using the Jacobi method to solve the associated
 34:  * linear system. The iteration is repeated until the l_2
 35:  * norm of the difference between successive iterates
 36:  * falls below a tolerance.
 37:  *
 38:  * The scheme is implemented by allocating two arrays
 39:  * (I, Iold), both initialized to zero. The first set of
 40:  * nested for loops applies one iteration of the Jacobi
 41:  * scheme; the update is applied only at the interior
 42:  * nodes.
 43:  *
 44:  * The second set of nested for loops is used to
 45:  * update Iold and compute the l_2 norm of the
 46:  * difference of the iterates.
 47:  *
 48:  * Computing the l_2 norm requires a reduction operation.
 49:  * To simplify the reduction, the RAJA API provides
 50:  * thread-safe reduction objects (e.g., RAJA::ReduceSum).
 51:  *
 52:  * ----[RAJA Concepts]---------------
 53:  * - RAJA::kernel nested loops
 54:  * - RAJA reductions (ReduceSum, ReduceMax)
 55:  *
 56:  */
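
/*
 * Worked example (an illustrative sketch, not part of the original example):
 * discretizing U_xx + U_yy = f with the standard five-point stencil on a
 * grid with spacing h gives, at interior node (m, n),
 *
 *   U[m-1,n] + U[m+1,n] + U[m,n-1] + U[m,n+1] - 4*U[m,n] = h*h*f(x_m, y_n),
 *
 * which rearranges to the Jacobi update used throughout this file (the
 * local variable f in the loops below already carries the h*h factor):
 *
 *   I[m,n] = 0.25 * ( -h*h*f(x_m, y_n) + Iold[m-1,n] + Iold[m+1,n]
 *                                      + Iold[m,n-1] + Iold[m,n+1] ).
 */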

 58: /*
 59:  *  ----[Constant Values]-----
 60:  * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block
 61:  *                   used by the RAJA::forall residual loop
 62:  *
 63:  * HIP_BLOCK_SIZE  - Number of threads per HIP thread block
 64:  *                   used by the RAJA::forall residual loop
 65:  *
 66:  * (The nested RAJA::kernel variants below use fixed 32 x 32 tiles.)
 67: */
 68: #if defined(RAJA_ENABLE_CUDA)
 69: const int CUDA_BLOCK_SIZE = 256;
 70: #endif

 72: #if defined(RAJA_ENABLE_HIP)
 73: const int HIP_BLOCK_SIZE = 256;
 74: #endif
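
/*
 * Usage sketch (illustrative): these constants are consumed later as
 * execution-policy template parameters, e.g.
 *
 *   RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange,
 *     [=] RAJA_DEVICE (RAJA::Index_type k) { ... });
 *
 * which launches the residual loop with CUDA_BLOCK_SIZE threads per block
 * (and analogously RAJA::hip_exec<HIP_BLOCK_SIZE> for the HIP variant).
 */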

 76: //
 77: //  Struct to hold grid info
 78: //  o - Origin in a Cartesian dimension
 79: //  h - Spacing between grid points
 80: //  n - Number of grid points in the dimension (including boundaries)
 81: //
 82: struct grid_s {
 83:   double o, h;
 84:   int n;
 85: };
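
/*
 * Illustrative example: main() below sets gridx.h = 1.0 / (N + 1.0) and
 * gridx.n = N + 2, so node m has coordinate x_m = gridx.o + m * gridx.h,
 * with m = 0 and m = N + 1 on the boundary and m = 1, ..., N interior.
 */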

 87: //
 88: // ----[Functions]---------
 89: // solution   - Function for the analytic solution
 90: // computeErr - Displays the maximum error in the solution
 91: //
 92: double solution(double x, double y);
 93: void computeErr(double *I, grid_s grid);

 95: int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 96: {

 98:   std::cout<<"Jacobi Example"<<std::endl;

100:   /*
101:    * ----[Solver Parameters]------------
102:    * tol       - Method terminates once the iterate-difference norm drops below tol
103:    * N         - Number of unknown grid points per Cartesian dimension
104:    * NN        - Total number of grid points, including boundary points
105:    * maxIter   - Maximum number of iterations to be taken
106:    *
107:    * resI2     - Squared l_2 norm of the difference between iterates
108:    * iteration - Iteration number
109:    * gridx     - Struct with grid information for a Cartesian dimension
110:   */
111:   double tol = 1e-10;

113:   int N = 50;
114:   int NN = (N + 2) * (N + 2);
115:   int maxIter = 100000;

117:   double resI2;
118:   int iteration;

120:   grid_s gridx;
121:   gridx.o = 0.0;
122:   gridx.h = 1.0 / (N + 1.0);
123:   gridx.n = N + 2;

125:   //
126:   //I, Iold - Holds iterates of Jacobi method
127:   //
128:   double *I = memoryManager::allocate<double>(NN);
129:   double *Iold = memoryManager::allocate<double>(NN);

131:   memset(I, 0, NN * sizeof(double));
132:   memset(Iold, 0, NN * sizeof(double));

134:   printf("Standard C++ Loop \n");
135:   resI2 = 1;
136:   iteration = 0;

138:   while (resI2 > tol * tol) {

140:     //
141:     // Jacobi Iteration
142:     //
143:     for (int n = 1; n <= N; ++n) {
144:       for (int m = 1; m <= N; ++m) {

146:         double x = gridx.o + m * gridx.h;
147:         double y = gridx.o + n * gridx.h;

149:         double f = gridx.h * gridx.h
150:                    * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

152:         int id = n * (N + 2) + m;
153:         I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
154:                            + Iold[id + 1]);
155:       }
156:     }

158:     //
159:     // Compute residual and update Iold
160:     //
161:     resI2 = 0.0;
162:     for (int k = 0; k < NN; k++) {
163:       resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
164:       Iold[k] = I[k];
165:     }

167:     if (iteration > maxIter) {
168:       printf("Standard C++ Loop - Maxed out on iterations \n");
169:       exit(-1);
170:     }

172:     iteration++;
173:   }
174:   computeErr(I, gridx);
175:   printf("No of iterations: %d \n \n", iteration);

177:   //
178:   // RAJA kernel and forall calls may be shortened by predefining ranges and policies
179:   //
180:   RAJA::RangeSegment gridRange(0, NN);
181:   RAJA::RangeSegment jacobiRange(1, (N + 1));

183:   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
184:   RAJA::statement::For<1, RAJA::seq_exec,
185:     RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >;
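
  /*
   * Rough sequential equivalent of the policy above (a sketch for clarity;
   * statement::For<1> drives the second range in the tuple, here the n
   * index, and statement::For<0> drives the first, here m):
   *
   *   for (RAJA::Index_type n = 1; n < N + 1; ++n) {   // For<1>, outer
   *     for (RAJA::Index_type m = 1; m < N + 1; ++m) { // For<0>, inner
   *       // body of Lambda<0>
   *     }
   *   }
   */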

187:   printf("RAJA: Sequential Policy - Nested ForallN \n");
188:   resI2 = 1;
189:   iteration = 0;
190:   memset(I, 0, NN * sizeof(double));
191:   memset(Iold, 0, NN * sizeof(double));

193:   /*
194:    *  Sequential Jacobi Iteration.
195:    *
196:    *  Note that a RAJA ReduceSum object is used to accumulate the sum
197:    *  for the residual. Since the loop is run sequentially, this is
198:    *  not strictly necessary. It is done here for consistency and
199:    *  comparison with other RAJA variants in this example.
200:    */
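
  /*
   * Usage note (sketch): the RAJA::ReduceSum below is updated inside the
   * loop with operator+= and read back on the host after the loop, either
   * through the implicit conversion used here (resI2 = RAJA_resI2) or
   * explicitly via RAJA_resI2.get().
   */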
201:   while (resI2 > tol * tol) {

203:     RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange),
204:                          [=] (RAJA::Index_type m, RAJA::Index_type n) {

206:           double x = gridx.o + m * gridx.h;
207:           double y = gridx.o + n * gridx.h;

209:           double f = gridx.h * gridx.h
210:                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

212:           int id = n * (N + 2) + m;
213:           I[id] =
214:                0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
215:                           + Iold[id + 1]);
216:         });

218:     RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
219:     RAJA::forall<RAJA::seq_exec>(
220:       gridRange, [=](RAJA::Index_type k) {

222:         RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
223:         Iold[k] = I[k];

225:       });

227:     resI2 = RAJA_resI2;
228:     if (iteration > maxIter) {
229:       printf("Jacobi: Sequential - Maxed out on iterations! \n");
230:       exit(-1);
231:     }
232:     iteration++;
233:   }
234:   computeErr(I, gridx);
235:   printf("No of iterations: %d \n \n", iteration);

237: #if defined(RAJA_ENABLE_OPENMP)
238:   printf("RAJA: OpenMP Policy - Nested ForallN \n");
239:   resI2 = 1;
240:   iteration = 0;
241:   memset(I, 0, NN * sizeof(double));
242:   memset(Iold, 0, NN * sizeof(double));

244:   /*
245:    *  OpenMP parallel Jacobi Iteration.
246:    *
247:    *  ----[RAJA Policies]-----------
248:    *  RAJA::omp_parallel_for_exec - distributes iterations of the
249:    *  outer loop across OpenMP threads
250:    *
251:    *  Note that the OpenMP RAJA ReduceSum object performs the reduction
252:    *  operation for the residual in a thread-safe manner.
253:    */

255:   using jacobiOmpNestedPolicy = RAJA::KernelPolicy<
256:       RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
257:         RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;
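
  /*
   * Rough raw-OpenMP equivalent of the policy above (sketch):
   *
   *   #pragma omp parallel for
   *   for (int n = 1; n < N + 1; ++n) {     // omp_parallel_for_exec
   *     for (int m = 1; m < N + 1; ++m) {   // seq_exec
   *       // body of Lambda<0>
   *     }
   *   }
   */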

259:   while (resI2 > tol * tol) {

261:     RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange),
262:                          [=] (RAJA::Index_type m, RAJA::Index_type n) {

264:       double x = gridx.o + m * gridx.h;
265:       double y = gridx.o + n * gridx.h;

267:       double f = gridx.h * gridx.h *
268:                  (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

270:       int id = n * (N + 2) + m;
271:       I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] +
272:                            Iold[id - 1] + Iold[id + 1]);
273:     });

275:     RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);

277:     RAJA::forall<RAJA::omp_parallel_for_exec>( gridRange,
278:       [=](RAJA::Index_type k) {

280:       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
281:       Iold[k] = I[k];

283:     });

285:     resI2 = RAJA_resI2;
286:     if (iteration > maxIter) {
287:       printf("Jacobi: OpenMP - Maxed out on iterations! \n");
288:       exit(-1);
289:     }
290:     iteration++;
291:   }
292:   computeErr(I, gridx);
293:   printf("No of iterations: %d \n \n", iteration);
294: #endif

296: #if defined(RAJA_ENABLE_CUDA)
297:   /*
298:    *  CUDA Jacobi Iteration.
299:    *
300:    *  ----[RAJA Policies]-----------
301:    *  RAJA::cuda_block_{x,y}_loop, RAJA::cuda_thread_{x,y}_direct -
302:    *  map tiled loop iterations to CUDA blocks and threads
303:    *
304:    *  Note that the CUDA RAJA ReduceSum object performs the reduction
305:    *  operation for the residual in a thread-safe manner on the GPU.
306:    */

308:   printf("RAJA: CUDA Policy - Nested ForallN \n");

310:   using jacobiCUDANestedPolicy = RAJA::KernelPolicy<
311:     RAJA::statement::CudaKernel<
312:       RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop,
313:         RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop,
314:           RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
315:             RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
316:               RAJA::statement::Lambda<0>
317:             >
318:           >
319:         >
320:       >
321:     > >;
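
  /*
   * Sketch of the mapping expressed by the policy above: the (m, n)
   * iteration space is tiled into 32 x 32 tiles; each tile is assigned to a
   * CUDA thread block (cuda_block_x/y_loop) and each iteration within a tile
   * to a thread (cuda_thread_x/y_direct), i.e. 32 * 32 = 1024 threads per block.
   */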

323:   resI2 = 1;
324:   iteration = 0;
325:   memset(I, 0, NN * sizeof(double));
326:   memset(Iold, 0, NN * sizeof(double));

328:   while (resI2 > tol * tol) {

330:     //
331:     // Jacobi Iteration
332:     //
333:     RAJA::kernel<jacobiCUDANestedPolicy>(
334:                          RAJA::make_tuple(jacobiRange,jacobiRange),
335:                          [=] RAJA_DEVICE  (RAJA::Index_type m, RAJA::Index_type n) {

337:           double x = gridx.o + m * gridx.h;
338:           double y = gridx.o + n * gridx.h;

340:           double f = gridx.h * gridx.h
341:                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

343:           int id = n * (N + 2) + m;
344:           I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
345:                              + Iold[id + 1]);
346:         });

348:     //
349:     // Compute residual and update Iold
350:     //
351:     RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
352:     RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
353:       gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {

355:           RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
356:           Iold[k] = I[k];

358:       });

360:     resI2 = RAJA_resI2;

362:     if (iteration > maxIter) {
363:       printf("RAJA: CUDA - Maxed out on iterations! \n");
364:       exit(-1);
365:     }
366:     iteration++;
367:   }
368:   cudaDeviceSynchronize();
369:   computeErr(I, gridx);
370:   printf("No of iterations: %d \n \n", iteration);
371: #endif

373: #if defined(RAJA_ENABLE_HIP)
374:   /*
375:    *  HIP Jacobi Iteration.
376:    *
377:    *  ----[RAJA Policies]-----------
378:    *  RAJA::hip_block_{x,y}_loop, RAJA::hip_thread_{x,y}_direct -
379:    *  map tiled loop iterations to HIP blocks and threads
380:    *
381:    *  Note that the HIP RAJA ReduceSum object performs the reduction
382:    *  operation for the residual in a thread-safe manner on the GPU.
383:    */

385:   printf("RAJA: HIP Policy - Nested ForallN \n");

387:   using jacobiHIPNestedPolicy = RAJA::KernelPolicy<
388:     RAJA::statement::HipKernel<
389:       RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop,
390:         RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop,
391:           RAJA::statement::For<1, RAJA::hip_thread_y_direct,
392:             RAJA::statement::For<0, RAJA::hip_thread_x_direct,
393:               RAJA::statement::Lambda<0>
394:             >
395:           >
396:         >
397:       >
398:     > >;
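
  /*
   * As in the CUDA variant, this policy (sketch of the mapping) tiles the
   * (m, n) iteration space into 32 x 32 tiles, assigning tiles to HIP thread
   * blocks and in-tile iterations to individual threads.
   */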

400:   resI2 = 1;
401:   iteration = 0;
402:   memset(I, 0, NN * sizeof(double));
403:   memset(Iold, 0, NN * sizeof(double));

405:   double *d_I    = memoryManager::allocate_gpu<double>(NN);
406:   double *d_Iold = memoryManager::allocate_gpu<double>(NN);
407:   hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
408:   hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));

410:   while (resI2 > tol * tol) {

412:     //
413:     // Jacobi Iteration
414:     //
415:     RAJA::kernel<jacobiHIPNestedPolicy>(
416:                          RAJA::make_tuple(jacobiRange,jacobiRange),
417:                          [=] RAJA_DEVICE  (RAJA::Index_type m, RAJA::Index_type n) {

419:           double x = gridx.o + m * gridx.h;
420:           double y = gridx.o + n * gridx.h;

422:           double f = gridx.h * gridx.h
423:                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

425:           int id = n * (N + 2) + m;
426:           d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1]
427:                              + d_Iold[id + 1]);
428:         });

430:     //
431:     // Compute residual and update Iold
432:     //
433:     RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
434:     RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
435:       gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {

437:           RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
438:           d_Iold[k] = d_I[k];

440:       });

442:     resI2 = RAJA_resI2;

444:     if (iteration > maxIter) {
445:       printf("RAJA: HIP - Maxed out on iterations! \n");
446:       exit(-1);
447:     }
448:     iteration++;
449:   }
450:   hipDeviceSynchronize();
451:   hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
452:   computeErr(I, gridx);
453:   printf("No of iterations: %d \n \n", iteration);

455:   memoryManager::deallocate_gpu(d_I);
456:   memoryManager::deallocate_gpu(d_Iold);
457: #endif

459:   memoryManager::deallocate(I);
460:   memoryManager::deallocate(Iold);

462:   return 0;
463: }

465: //
466: // Function for the analytic solution
467: //
468: double solution(double x, double y)
469: {
470:   return x * y * exp(x - y) * (1 - x) * (1 - y);
471: }

473: //
474: // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
475: //
476: void computeErr(double *I, grid_s grid)
477: {

479:   RAJA::RangeSegment gridRange(0, grid.n);
480:   RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);

482:   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
483:     RAJA::statement::For<1, RAJA::seq_exec,
484:       RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;

486:   RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange,gridRange),
487:                        [=] (RAJA::Index_type ty, RAJA::Index_type tx) {

489:       int id = tx + grid.n * ty;
490:       double x = grid.o + tx * grid.h;
491:       double y = grid.o + ty * grid.h;
492:       double myErr = std::abs(I[id] - solution(x, y));
493:       tMax.max(myErr);
494:     });

496:   double maxErr = tMax;
497:   printf("Max error = %lg, h = %f \n", maxErr, grid.h);
498: }

500: /*TEST

502:     test:
503:       requires: raja !cuda

505:     test:
506:       suffix: 2
507:       requires: raja cuda

509: TEST*/