From 0780c15590302ec1197cfdcbe13d669a271a7df0 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 8 Oct 2024 15:06:02 -0700 Subject: [PATCH 1/5] Add clang-format on/off annotations using new script --- examples/dynamic-forall.cpp | 2 + examples/dynamic_mat_transpose.cpp | 6 ++ examples/forall-param-reductions.cpp | 12 +++ examples/forall_multi-reductions.cpp | 4 + examples/jacobi.cpp | 14 +++ examples/kernel-dynamic-tile.cpp | 2 + examples/launch-param-reductions.cpp | 10 ++ examples/launch_flatten.cpp | 4 + examples/launch_matrix-multiply.cpp | 27 +++++ examples/launch_reductions.cpp | 2 + examples/memoryManager.hpp | 1 + examples/omp-target-kernel.cpp | 5 + examples/pi-reduce_vs_atomic.cpp | 8 ++ examples/raja-launch.cpp | 4 + examples/red-black-gauss-seidel.cpp | 1 + examples/resource-dynamic-forall.cpp | 2 + examples/resource-forall.cpp | 4 + examples/resource-kernel.cpp | 4 + examples/resource-launch.cpp | 2 + examples/resource-runtime-launch.cpp | 2 + examples/tut_daxpy.cpp | 4 + examples/tut_halo-exchange.cpp | 12 +++ examples/tut_launch_basic.cpp | 2 + examples/tut_matrix-multiply.cpp | 81 ++++++++++++++ examples/wave-eqn.cpp | 11 ++ scripts/clang-format-on-off.py | 151 +++++++++++++++++++++++++++ 26 files changed, 377 insertions(+) create mode 100644 scripts/clang-format-on-off.py diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp index 5131010bd6..8cab9dddca 100644 --- a/examples/dynamic-forall.cpp +++ b/examples/dynamic-forall.cpp @@ -28,6 +28,7 @@ void checkResult(int* res, int len); void printResult(int* res, int len); +// clang-format off using policy_list = camp::list; +// clang-format on int main(int argc, char *argv[]) { diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index feb5247224..4f1ae550b5 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -59,6 +59,7 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c); template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format off using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) RAJA::omp_launch_t @@ -79,6 +80,7 @@ using launch_policy = RAJA::LaunchPolicy< #endif >; +// clang-format on /* * Define team policies. * Up to 3 dimension are supported: x,y,z @@ -322,6 +324,7 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_shared_mem_end // _dynamic_mattranspose_kernel_start +// clang-format off RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), @@ -378,6 +381,7 @@ int main(int argc, char *argv[]) }); }); // _dynamic_mattranspose_kernel_end +// clang-format on #if defined(RAJA_GPU_ACTIVE) if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { @@ -414,6 +418,7 @@ int main(int argc, char *argv[]) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -432,6 +437,7 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index 2305f74c2b..dddd2d2459 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -134,6 +134,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_minloc2(-1); RAJA::Index_type seq_maxloc2(-1); +// clang-format off RAJA::forall(host_res, arange, RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), @@ -164,6 +165,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; @@ -198,6 +200,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_minloc2(-1); RAJA::Index_type omp_maxloc2(-1); +// clang-format off RAJA::forall(host_res, arange, RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), @@ -228,6 +231,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; @@ -264,6 +268,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_t_minloc2(-1); RAJA::Index_type omp_t_maxloc2(-1); +// clang-format off RAJA::forall(omp_res, arange, RAJA::expt::Reduce(&omp_t_sum), RAJA::expt::Reduce(&omp_t_min), @@ -294,6 +299,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; @@ -334,6 +340,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type cuda_minloc2(-1); RAJA::Index_type cuda_maxloc2(-1); +// clang-format off RAJA::forall(cuda_res, arange, RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), @@ -364,6 +371,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; @@ -403,6 +411,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type hip_minloc2(-1); RAJA::Index_type hip_maxloc2(-1); +// clang-format off RAJA::forall(hip_res, arange, RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), @@ -433,6 +442,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; @@ -473,6 +483,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type sycl_minloc2(-1); RAJA::Index_type sycl_maxloc2(-1); +// clang-format off RAJA::forall(sycl_res, arange, RAJA::expt::Reduce(&sycl_sum), RAJA::expt::Reduce(&sycl_min), @@ -503,6 +514,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp index c3be312194..8dad6d8ab5 100644 --- 
a/examples/forall_multi-reductions.cpp +++ b/examples/forall_multi-reductions.cpp @@ -36,6 +36,7 @@ struct Backend std::string name; }; +// clang-format off auto example_policies = camp::make_tuple( Backend{"Sequential"} @@ -54,6 +55,7 @@ auto example_policies = camp::make_tuple( ); +// clang-format on template < typename exec_policy, typename multi_reduce_policy > void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) { @@ -131,6 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) //----------------------------------------------------------------------------// +// clang-format off RAJA::for_each_tuple(example_policies, [&](auto const& backend) { std::cout << "Running " << backend.name << " policies" << '\n'; @@ -154,6 +157,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) std::cout << std::endl; }); +// clang-format on //----------------------------------------------------------------------------// // diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index 0badaa7396..e86632b427 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -184,10 +184,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment gridRange(0, NN); RAJA::RangeSegment jacobiRange(1, (N + 1)); +// clang-format off using jacobiSeqNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; +// clang-format on printf("RAJA: Sequential Policy - Nested ForallN \n"); resI2 = 1; iteration = 0; @@ -257,10 +259,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * operation for the residual in a thread-safe manner. */ +// clang-format off using jacobiOmpNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; +// clang-format on while (resI2 > tol * tol) { RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), @@ -315,6 +319,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: CUDA Policy - Nested ForallN \n"); +// clang-format off using jacobiCUDANestedPolicy = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, @@ -328,6 +333,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > >; +// clang-format on resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); @@ -357,6 +363,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Compute residual and update Iold // RAJA::ReduceSum RAJA_resI2(0.0); +// clang-format off RAJA::forall>( gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { @@ -365,6 +372,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on resI2 = RAJA_resI2; if (iteration > maxIter) { @@ -392,6 +400,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: HIP Policy - Nested ForallN \n"); +// clang-format off using jacobiHIPNestedPolicy = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, @@ -405,6 +414,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > >; +// clang-format on resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); @@ -439,6 +449,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Compute residual and update Iold // RAJA::ReduceSum 
RAJA_resI2(0.0); +// clang-format off RAJA::forall>( gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { @@ -447,6 +458,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on resI2 = RAJA_resI2; if (iteration > maxIter) { @@ -488,10 +500,12 @@ void computeErr(double *I, grid_s grid) RAJA::RangeSegment gridRange(0, grid.n); RAJA::ReduceMax tMax(-1.0); +// clang-format off using jacobiSeqNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(gridRange,gridRange), [=] (RAJA::Index_type ty, RAJA::Index_type tx ) { diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp index 5de2123425..80cec6cbdb 100644 --- a/examples/kernel-dynamic-tile.cpp +++ b/examples/kernel-dynamic-tile.cpp @@ -14,6 +14,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using namespace RAJA; +// clang-format off kernel_param< KernelPolicy< statement::Tile<1, tile_dynamic<1>, seq_exec, @@ -31,4 +32,5 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." << std::endl; }); +// clang-format on } diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 5bec907c33..6280963c6f 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -150,6 +150,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_minloc2(-1); RAJA::Index_type seq_maxloc2(-1); +// clang-format off RAJA::launch (host_res, RAJA::LaunchParams(), "SeqReductionKernel", RAJA::expt::Reduce(&seq_sum), @@ -186,6 +187,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; @@ -217,6 +219,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_minloc2(-1); RAJA::Index_type omp_maxloc2(-1); +// clang-format off RAJA::launch (host_res, RAJA::LaunchParams(), "OmpReductionKernel", RAJA::expt::Reduce(&omp_sum), @@ -253,6 +256,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; @@ -289,6 +293,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type cuda_minloc2(-1); RAJA::Index_type cuda_maxloc2(-1); +// clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), "CUDAReductionKernel", @@ -329,6 +334,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; @@ -366,6 +372,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type hip_minloc2(-1); RAJA::Index_type hip_maxloc2(-1); +// clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), "HipReductionKernel", @@ -403,6 +410,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** 
RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; @@ -440,6 +448,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type sycl_minloc2(-1); RAJA::Index_type sycl_maxloc2(-1); +// clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), "SyclReductionKernel", @@ -477,6 +486,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; diff --git a/examples/launch_flatten.cpp b/examples/launch_flatten.cpp index 2a3d92ad84..e56461506d 100644 --- a/examples/launch_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -97,6 +97,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); +// clang-format off RAJA::launch (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -117,6 +118,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on //----------------------------------------------------------------------------// std::cout << "\n Running host version of teams_flatten example ...\n"; @@ -125,6 +127,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); +// clang-format off RAJA::launch (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -145,6 +148,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on if ( device_kernel_sum.get() == host_kernel_sum.get() ) { std::cout << "\n\t result -- PASS\n"; } else { diff --git a/examples/launch_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp index 797c5ee7c5..bccf7ccd87 100644 --- a/examples/launch_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -37,6 +37,7 @@ /* * Define host/device launch policies */ +// clang-format off using launch_policy = RAJA::LaunchPolicy< RAJA::seq_launch_t #if defined(RAJA_ENABLE_CUDA) @@ -49,6 +50,7 @@ using launch_policy = RAJA::LaunchPolicy< #endif >; +// clang-format on using loop_policy = RAJA::seq_exec; #if defined(RAJA_ENABLE_CUDA) @@ -188,18 +190,22 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) template void checkResult(T *C, int N); +// clang-format off template void checkResult(RAJA::View> Cview, int N); +// clang-format on // // Functions for printing results // template void printResult(T *C, int N); +// clang-format off template void printResult(RAJA::View> Cview, int N); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -314,6 +320,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //two for loops. 
// _matmult_basickernel_start +// clang-format off RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -332,6 +339,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_basickernel_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -355,6 +363,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using omp_row_policy0 = RAJA::LoopPolicy; +// clang-format off RAJA::launch(RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -371,6 +380,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -387,6 +397,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using global_thread_xy = RAJA::LoopPolicy; +// clang-format off RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -403,6 +414,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP @@ -425,6 +437,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // +// clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), @@ -443,6 +456,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -461,6 +475,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. // +// clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -486,6 +501,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -521,6 +537,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // +// clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), @@ -540,6 +557,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); +// clang-format on hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); checkResult(Cview, N); //printResult(Cview, N); @@ -561,6 +579,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
// +// clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -586,6 +605,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); +// clang-format on hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); checkResult(Cview, N); //printResult(Cview, N); @@ -610,6 +630,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // +// clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -672,6 +693,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // kernel +// clang-format on checkResult(Cview, N); //printResult(Cview, N); #endif @@ -772,6 +794,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Functions to check result and report P/F. // +// clang-format off template void checkResult(T* C, int N) { @@ -790,6 +813,8 @@ void checkResult(T* C, int N) } }; +// clang-format on +// clang-format off template void checkResult(RAJA::View> Cview, int N) { @@ -808,9 +833,11 @@ void checkResult(RAJA::View> Cview, int N) } }; +// clang-format on // // Functions to print result. // +// clang-format off template void printResult(T* C, int N) { diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index 24e313e649..ba611f7782 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -149,6 +149,7 @@ int main(int argc, char *argv[]) const int TEAM_SZ = 256; const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); +// clang-format off RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), @@ -170,6 +171,7 @@ int main(int argc, char *argv[]) }); +// clang-format on std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp index 62d3d6e3e7..960142a83b 100644 --- a/examples/memoryManager.hpp +++ b/examples/memoryManager.hpp @@ -31,6 +31,7 @@ namespace memoryManager static camp::resources::Resource* sycl_res; #endif +// clang-format off template T *allocate(RAJA::Index_type size) { diff --git a/examples/omp-target-kernel.cpp b/examples/omp-target-kernel.cpp index ce425e07a6..8351d7f388 100644 --- a/examples/omp-target-kernel.cpp +++ b/examples/omp-target-kernel.cpp @@ -12,13 +12,16 @@ using namespace RAJA::statement; int main(int /*argc*/, char** /*argv[]*/) { +// clang-format off // using Pol = KernelPolicy< // For<1, RAJA::seq_exec>, // For<0, RAJA::omp_target_parallel_for_exec<1>, Lambda<0> > // >; using Pol = KernelPolicy< +// clang-format on Collapse, Lambda<0> > >; +// clang-format on double* array = new double[25*25]; #pragma omp target enter data map(to: array[0:25*25]) @@ -35,10 +38,12 @@ int main(int /*argc*/, char** /*argv[]*/) { //array[0] = i*j; }); #else +// clang-format off RAJA::forall>( RAJA::RangeSegment(0,25), [=] (int i) { // }); #endif +// clang-format on } diff --git a/examples/pi-reduce_vs_atomic.cpp b/examples/pi-reduce_vs_atomic.cpp index ea0c18611f..9531bce111 100644 --- a/examples/pi-reduce_vs_atomic.cpp +++ b/examples/pi-reduce_vs_atomic.cpp @@ -102,12 +102,14 @@ int main(int RAJA_UNUSED_ARG(argc), 
char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; +// clang-format off RAJA::forall(bins, [=](int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; +// clang-format on std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; @@ -140,12 +142,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; +// clang-format off RAJA::forall(bins, [=](int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; +// clang-format on std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; @@ -180,11 +184,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; +// clang-format off RAJA::forall(bins, [=] RAJA_DEVICE (int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; +// clang-format on std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; @@ -219,11 +225,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using ATOMIC_POL4 = RAJA::hip_atomic; +// clang-format off RAJA::forall(bins, [=] RAJA_DEVICE (int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); }); +// clang-format on hipErrchk(hipMemcpy( atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost )); *atomic_pi *= 4.0; std::cout << "\tpi = " << std::setprecision(prec) diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index b2642e16ff..80cc251b92 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -34,6 +34,7 @@ /* * Define host/device launch policies */ +// clang-format off using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) RAJA::omp_launch_t @@ -50,6 +51,7 @@ using launch_policy = RAJA::LaunchPolicy< #endif >; +// clang-format on /* * Define team policies. * Up to 3 dimension are supported: x,y,z @@ -150,6 +152,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> D(Ddat, N_tri, N_tri); +// clang-format off RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), @@ -175,6 +178,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // outer lambda +// clang-format on if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { host_res.deallocate(Ddat); } diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index cfe74dc58a..4f784b841d 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -174,6 +174,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to generate RAJA ListSegments and populate a RAJA Static Index // Set. 
+// clang-format off RAJA::TypedIndexSet gsColorPolicy(int N, camp::resources::Resource res) { diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 0b35017fac..27656435ac 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -28,6 +28,7 @@ void checkResult(int* res, int len); void printResult(int* res, int len); +// clang-format off using policy_list = camp::list; +// clang-format on int main(int argc, char *argv[]) { diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index b374bdba3f..7373a5d6fb 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -128,11 +128,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; +// clang-format off RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); +// clang-format on checkResult(c, N); //----------------------------------------------------------------------------// @@ -141,11 +143,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; +// clang-format off RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); +// clang-format on checkResult(c, N); #endif diff --git a/examples/resource-kernel.cpp b/examples/resource-kernel.cpp index a754876479..64ed16b710 100644 --- a/examples/resource-kernel.cpp +++ b/examples/resource-kernel.cpp @@ -29,6 +29,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment n_range(0, N); using TEST_POL = +// clang-format off RAJA::KernelPolicy< statement::CudaKernelAsync< statement::For<0, cuda_block_x_loop, @@ -39,6 +40,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ RAJA::resources::Cuda res_cuda; @@ -58,6 +61,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; diff --git a/examples/resource-launch.cpp b/examples/resource-launch.cpp index 288b70f8a5..ac161c1e75 100644 --- a/examples/resource-launch.cpp +++ b/examples/resource-launch.cpp @@ -34,6 +34,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using threads_x = RAJA::LoopPolicy; +// clang-format off RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ @@ -59,6 +60,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index e52923d81f..07b4ea51a0 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -160,6 +160,7 @@ int main(int argc, char *argv[]) #endif //How the kernel executes now depends on how the resource is constructed (host or device) +// clang-format off RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), @@ -176,6 +177,7 @@ int main(int argc, char *argv[]) }); }); +// clang-format on std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; diff --git a/examples/tut_daxpy.cpp 
b/examples/tut_daxpy.cpp index 74b127e0d6..d9f1815261 100644 --- a/examples/tut_daxpy.cpp +++ b/examples/tut_daxpy.cpp @@ -154,11 +154,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); +// clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { a[i] += b[i] * c; }); +// clang-format on cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); cudaErrchk(cudaFree(a)); @@ -184,11 +186,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); +// clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { a[i] += b[i] * c; }); +// clang-format on hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); hipErrchk(hipFree(a)); diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index c584695128..7c7eb19832 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -56,18 +56,23 @@ const int num_neighbors = 26; // // Functions for checking and printing results // +// clang-format off void checkResult(std::vector const& vars, std::vector const& vars_ref, int var_size, int num_vars); void printResult(std::vector const& vars, int var_size, int num_vars); +// clang-format on // // Functions for allocating and populating packing and unpacking lists // +// clang-format off void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, const int halo_width, const int* grid_dims); void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, +// clang-format on const int halo_width, const int* grid_dims); void destroy_pack_lists(std::vector& pack_index_lists); +// clang-format on void destroy_unpack_lists(std::vector& unpack_index_lists); @@ -79,6 +84,7 @@ struct memory_manager_allocator memory_manager_allocator() = default; template < typename U > +// clang-format off constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept { } @@ -110,7 +116,9 @@ bool operator==(memory_manager_allocator const&, memory_manager_allocator { return true; } +// clang-format on +// clang-format off template bool operator!=(memory_manager_allocator const& lhs, memory_manager_allocator const& rhs) { @@ -161,6 +169,8 @@ struct pinned_allocator } }; +// clang-format on +// clang-format off template bool operator==(pinned_allocator const&, pinned_allocator const&) { @@ -1787,9 +1797,11 @@ struct Extent int k_max; }; +// clang-format on // // Function to generate index lists for packing. 
// +// clang-format off void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, const int halo_width, const int* grid_dims) diff --git a/examples/tut_launch_basic.cpp b/examples/tut_launch_basic.cpp index 96a2ffe2f0..9f6b6c63ff 100644 --- a/examples/tut_launch_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -175,6 +175,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) const int Nthreads = 2; // __compute_grid_end +// clang-format off RAJA::launch(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), RAJA::Threads(Nthreads,Nthreads)), @@ -200,6 +201,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) }); +// clang-format on //Equivalent C style loops if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { // _c_style_loops_start diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index e939d96dbb..08325513d2 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -81,18 +81,22 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) template void checkResult(T *C, int N); +// clang-format off template void checkResult(RAJA::View> Cview, int N); +// clang-format on // // Functions for printing results // template void printResult(T *C, int N); +// clang-format off template void printResult(RAJA::View> Cview, int N); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -193,6 +197,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N*N * sizeof(double)); // _matmult_outerforall_start +// clang-format off RAJA::forall( row_range, [=](int row) { for (int col = 0; col < N; ++col) { @@ -207,6 +212,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_outerforall_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -230,6 +236,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N*N * sizeof(double)); // _matmult_nestedforall_start +// clang-format off RAJA::forall( row_range, [=](int row) { RAJA::forall( col_range, [=](int col) { @@ -244,6 +251,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_nestedforall_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -283,6 +291,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_basickernel_start using EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -291,6 +300,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -302,6 +313,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_basickernel_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -316,6 +328,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_ompkernel_start using EXEC_POL1 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -324,7 +337,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_ompkernel_end +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, 
int row) { @@ -336,6 +351,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -354,6 +370,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _matmult_ompkernel_swap_start using EXEC_POL2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // col RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row @@ -362,7 +379,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_ompkernel_swap_end +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -374,6 +393,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -389,6 +409,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // outer loop with a 'collapse(2) clause. // using EXEC_POL3 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -396,6 +417,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -407,6 +430,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); +// clang-format on //printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP @@ -430,6 +454,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // using EXEC_POL4 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, @@ -440,6 +465,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -451,6 +478,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); +// clang-format on //printResult(Cview, N); @@ -470,6 +498,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, @@ -484,6 +513,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -495,6 +526,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); +// clang-format on //printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA @@ -530,6 +562,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. 
// using EXEC_POL4 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, @@ -540,6 +573,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -552,6 +587,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -573,6 +609,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -589,6 +626,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -601,6 +640,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); #endif // if RAJA_ENABLE_HIP @@ -633,6 +673,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _matmult_3lambdakernel_seq_start using EXEC_POL6a = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -645,6 +686,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -667,6 +710,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _matmult_3lambdakernel_seq_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -689,6 +733,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using RAJA::Params; using EXEC_POL6b = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -701,6 +746,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -723,6 +770,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _matmult_3lambdakernel_args_seq_end +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -737,6 +785,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_ompcollapse_start using EXEC_POL7 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -748,7 +797,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_ompcollapse_end +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -771,6 +822,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP @@ -785,6 +837,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_cuda_start using EXEC_POL8 = +// clang-format off RAJA::KernelPolicy< 
RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row @@ -799,7 +852,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_cuda_end +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -822,6 +877,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -833,6 +889,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_cudatiled_start using EXEC_POL9a = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -853,7 +910,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_cudatiled_end +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -876,6 +935,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -886,6 +946,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N*N * sizeof(double)); using EXEC_POL9b = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -906,6 +967,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -928,6 +991,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); @@ -954,6 +1018,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using shmem_Lambda4 = RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; using EXEC_POL10 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed//Cuda kernel >; +// clang-format on Shmem aShared, bShared, cShared; +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -1053,6 +1120,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Cview, N); //printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA @@ -1095,6 +1163,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_hip_start using EXEC_POL8 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, // row @@ -1110,7 +1179,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_hip_end +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -1133,6 +1204,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); checkResult(Cview, N); //printResult(Cview, N); @@ -1147,6 +1219,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _matmult_3lambdakernel_hiptiled_start using EXEC_POL9b = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -1167,7 +1240,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // 
_matmult_3lambdakernel_hiptiled_end +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -1190,6 +1265,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); +// clang-format on hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); checkResult(Cview, N); //printResult(Cview, N); @@ -1240,6 +1316,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Functions to check result and report P/F. // +// clang-format off template void checkResult(T* C, int N) { @@ -1258,6 +1335,8 @@ void checkResult(T* C, int N) } }; +// clang-format on +// clang-format off template void checkResult(RAJA::View> Cview, int N) { @@ -1276,9 +1355,11 @@ void checkResult(RAJA::View> Cview, int N) } }; +// clang-format on // // Functions to print result. // +// clang-format off template void printResult(T* C, int N) { diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index e3b83480ee..80d994ca99 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -123,17 +123,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Sequential policy +// clang-format off using fdPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; +// clang-format on // OpenMP policy +// clang-format off //using fdPolicy = RAJA::KernelPolicy< //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; +// clang-format on // CUDA policy //using fdPolicy = +// clang-format off //RAJA::KernelPolicy< // RAJA::statement::CudaKernel< // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, @@ -148,6 +153,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // > // >; +// clang-format on time = 0; setIC(P1, P2, (time - dt), time, grid); @@ -191,10 +197,12 @@ void computeErr(double *P, double tf, grid_s grid) RAJA::RangeSegment fdBounds(0, grid.nx); RAJA::ReduceMax tMax(-1.0); +// clang-format off using initialPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec , RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { @@ -222,10 +230,12 @@ void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) RAJA::RangeSegment fdBounds(0, grid.nx); +// clang-format off using initialPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), [=] (RAJA::Index_type tx, RAJA::Index_type ty) { @@ -240,6 +250,7 @@ void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) +// clang-format off template void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx) { diff --git a/scripts/clang-format-on-off.py b/scripts/clang-format-on-off.py new file mode 100644 index 0000000000..6d7c0453da --- /dev/null +++ b/scripts/clang-format-on-off.py @@ -0,0 +1,151 @@ +import string +import re +import os +from argparse import ArgumentParser + +# Parse a list of strings (that is assumed to be valid cpp code). Identify all nested +# templated type definitions of depth greater than cutoff. 
Return a list of integers
+# representing the indexes of such declarations
+def find_nested_template_declarations(inp : list[str], cutoff : int = 2) -> list[tuple[int,int]]:
+    out = []
+    i = 0
+    while i < len(inp):
+        line = inp[i]
+        ptr1 = i
+        ptr2 = i
+        num_brackets = len(re.findall(r'(?<!<)<(?!<| )', inp[ptr1]))
+        num_closing_brackets = len(re.findall(r'(?<!>)>(?!>| )', inp[ptr1]))
+        num_braces = 0
+        num_closing_braces = 0
+        num_paren_open = inp[ptr1].count('(')
+        num_paren_close = inp[ptr1].count(')')
+        # stack for tracking bracket closure
+
+        # bool for multiline template class invocation
+        is_template_decl = (
+            (num_brackets > num_closing_brackets or
+             (num_brackets > 0 and ';' not in inp[ptr1]) or
+             num_paren_open > num_paren_close) and
+            ";" not in inp[ptr1] and
+            "include" not in inp[ptr1])
+        while is_template_decl and ptr2 < len(inp):
+            #print(i, num_brackets, num_closing_brackets)
+            num_brackets += len(re.findall(r'(?<!<)<(?!<| )', inp[ptr2]))
+            num_closing_brackets += inp[ptr2].count(">")
+            num_braces += inp[ptr2].count("{")
+            num_closing_braces += inp[ptr2].count("}")
+            # scroll till we get through all the brackets and the declaration
+            if ";" in inp[ptr2]:
+                #print("kword", ptr2, num_brackets)
+                if num_braces == num_closing_braces:
+                    # print("kword", inp[ptr2], ptr2, num_brackets)
+                    break
+            ptr2 += 1
+        # count the depth of template parametrization
+        if num_brackets > cutoff and ptr1 != ptr2:
+            out.append((ptr1, ptr2))
+        i = ptr2 + 1
+
+    return out
+
+def add_clang_format_comments(inp : list[str], insertions: list[tuple[int,int]]) -> list[str]:
+    clang_format_off = "// clang-format off\n"
+    clang_format_on = "// clang-format on\n"
+    num_insertions = 0
+    current_insertion_idx = 0
+    out = []
+    for i in range(0, len(inp)):
+        done_inserting = current_insertion_idx >= len(insertions)
+        if not done_inserting and i == insertions[current_insertion_idx][0]:
+            out.append(clang_format_off)
+        out.append(inp[i])
+        #print(i, insertions[current_insertion_idx][1] + 1)
+        if not done_inserting and i == insertions[current_insertion_idx][1] + 1:
+            out.append(clang_format_on)
+            # we've handled one off-on pair, increment pointer in insertion array
+            current_insertion_idx += 1
+
+    return out
+
+def refactor_file(fname, str):
+    with open(fname, "w") as f:
+        f.write(str)
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("-d", "--directory", dest="directory", help="directory to refactor")
+    args = parser.parse_args()
+    dir = args.directory
+    if args.directory is None or not os.path.exists(dir):
+        exit("please specify a valid directory to add clang format annotations")
+    for d, _, files in os.walk(dir):
+        for f in files:
+            out = ""
+            with open(os.path.abspath(os.path.join(d, f)), "r") as f_obj:
+                f_list = f_obj.readlines()
+                out = add_clang_format_comments(f_list, find_nested_template_declarations(f_list))
+            with open(os.path.abspath(os.path.join(d, f)), "w") as f_obj:
+                for line in out:
+                    f_obj.write(f"{line}")
+
+# + #test_str = """ + #/* + # Define CUDA/HIP matrix multiplication kernel for comparison to RAJA version + #*/ + ##if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + #__global__ void matMultKernel(int N, double* C, double* A, double* B) + #{ + # int row = blockIdx.y * blockDim.y + threadIdx.y; + # int col = blockIdx.x * blockDim.x + threadIdx.x; +# + # if ( row < N && col < N ) { + # double dot = 0.0; + # for (int k = 0; k < N; ++k) { + # dot += A(row, k) * B(k, col); + # } +# + # C(row, col) = dot; + # } + #} +# + #__global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) + #{ +# + # int Row = blockIdx.y*THREAD_SZ + threadIdx.y; + # int Col = blockIdx.x*THREAD_SZ +
threadIdx.x; +# + # __shared__ double As[THREAD_SZ][THREAD_SZ]; + # __shared__ double Bs[THREAD_SZ][THREAD_SZ]; + # __shared__ double Cs[THREAD_SZ][THREAD_SZ]; +# + # Cs[threadIdx.y][threadIdx.x] = 0.0; +# + # for (int k = 0; k < (THREAD_SZ + N - 1)/THREAD_SZ; k++) { +# + # if ( static_cast(k*THREAD_SZ + threadIdx.x) < N && Row < N ) + # As[threadIdx.y][threadIdx.x] = A[Row*N + k*THREAD_SZ + threadIdx.x]; + # else + # As[threadIdx.y][threadIdx.x] = 0.0; +# + # if ( static_cast(k*THREAD_SZ + threadIdx.y) < N && Col < N) + # Bs[threadIdx.y][threadIdx.x] = B[(k*THREAD_SZ + threadIdx.y)*N + Col]; + # else + # Bs[threadIdx.y][threadIdx.x] = 0.0; +# + # __syncthreads(); +# + # for (int n = 0; n < THREAD_SZ; ++n) + # Cs[threadIdx.y][threadIdx.x] += As[threadIdx.y][n] * Bs[n][threadIdx.x]; +# + # __syncthreads(); + # } +# + # if (Row < N && Col < N) + # C[((blockIdx.y * blockDim.y + threadIdx.y)*N) + + # (blockIdx.x * blockDim.x)+ threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; + #} + ##endif + #""" + #f_list = test_str.split('\n') + #"\n".join(add_clang_format_comments(f_list, find_nested_template_declarations(f_list))) \ No newline at end of file From 8ea1e156480cbb7bab8d8a8ff8c1c11a7ffe0591 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 8 Oct 2024 15:06:53 -0700 Subject: [PATCH 2/5] Add clang format annotations to the exercises directory --- exercises/atomic-histogram_solution.cpp | 14 ++++++++ exercises/dot-product.cpp | 2 ++ exercises/dot-product_solution.cpp | 8 +++++ .../kernel-matrix-transpose-local-array.cpp | 31 +++++++++++++++++ ...-matrix-transpose-local-array_solution.cpp | 31 +++++++++++++++++ exercises/kernel-matrix-transpose-tiled.cpp | 13 +++++++ ...kernel-matrix-transpose-tiled_solution.cpp | 17 ++++++++++ exercises/kernel-matrix-transpose.cpp | 7 ++++ .../kernel-matrix-transpose_solution.cpp | 13 +++++++ exercises/kernelintro-execpols.cpp | 26 ++++++++++++++ exercises/kernelintro-execpols_solution.cpp | 34 +++++++++++++++++++ exercises/kernelintro-nested-loop-reorder.cpp | 2 ++ ...rnelintro-nested-loop-reorder_solution.cpp | 6 ++++ .../launch-matrix-transpose-local-array.cpp | 15 ++++++++ ...-matrix-transpose-local-array_solution.cpp | 15 ++++++++ exercises/launch-matrix-transpose-tiled.cpp | 15 ++++++++ ...launch-matrix-transpose-tiled_solution.cpp | 15 ++++++++ exercises/launch-matrix-transpose.cpp | 11 ++++++ .../launch-matrix-transpose_solution.cpp | 13 +++++++ exercises/launchintro-execpols.cpp | 14 ++++++++ exercises/launchintro-execpols_solution.cpp | 14 ++++++++ exercises/memoryManager.hpp | 1 + exercises/offset-layout-stencil.cpp | 8 +++++ exercises/offset-layout-stencil_solution.cpp | 10 ++++++ .../permuted-layout-batch-matrix-multiply.cpp | 19 +++++++++++ ...-layout-batch-matrix-multiply_solution.cpp | 19 +++++++++++ exercises/scan.cpp | 3 ++ exercises/scan_solution.cpp | 25 ++++++++++++++ exercises/sort.cpp | 3 ++ exercises/sort_solution.cpp | 25 ++++++++++++++ .../tutorial_halfday/ex5_line-of-sight.cpp | 1 + .../ex5_line-of-sight_solution.cpp | 1 + .../ex6_stencil-offset-layout.cpp | 1 + .../ex6_stencil-offset-layout_solution.cpp | 3 ++ .../ex8_tiled-matrix-transpose.cpp | 15 ++++++++ .../ex8_tiled-matrix-transpose_solution.cpp | 15 ++++++++ .../ex9_matrix-transpose-local-array.cpp | 13 +++++++ ..._matrix-transpose-local-array_solution.cpp | 13 +++++++ exercises/tutorial_halfday/memoryManager.hpp | 1 + exercises/vector-addition.cpp | 2 ++ exercises/vector-addition_solution.cpp | 10 ++++++ exercises/vertexsum-indexset.cpp | 4 +++ 
exercises/vertexsum-indexset_solution.cpp | 4 +++ exercises/view-layout.cpp | 11 ++++++ exercises/view-layout_solution.cpp | 15 ++++++++ 45 files changed, 538 insertions(+) diff --git a/exercises/atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp index 368f729ebc..1e235dd025 100644 --- a/exercises/atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -128,12 +128,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment array_range(0,N); // _range_atomic_histogram_end +// clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -149,12 +151,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); // _rajaomp_atomic_histogram_start +// clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajaomp_atomic_histogram_end +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -173,12 +177,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); +// clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -196,12 +202,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); // _rajacuda_atomic_histogram_start +// clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajacuda_atomic_histogram_end +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -221,12 +229,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); // _rajacuda_atomicauto_histogram_start +// clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajacuda_atomicauto_histogram_end +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -244,12 +254,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); // _rajahip_atomic_histogram_start +// clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajahip_atomic_histogram_end +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -269,12 +281,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); // _rajahip_atomicauto_histogram_start +// clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajahip_atomicauto_histogram_end +// clang-format on checkResult(hist, hist_ref, M); //printArray(hist, M); diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp index c2830c6cb2..0c6fba9b93 100644 --- a/exercises/dot-product.cpp +++ b/exercises/dot-product.cpp @@ -90,10 +90,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum seqdot(0.0); +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); +// clang-format on dot = seqdot.get(); std::cout << "\t (a, b) = " << dot << std::endl; diff --git a/exercises/dot-product_solution.cpp b/exercises/dot-product_solution.cpp index 
d0ae458171..c1d340935b 100644 --- a/exercises/dot-product_solution.cpp +++ b/exercises/dot-product_solution.cpp @@ -84,10 +84,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); +// clang-format on dot = seqdot.get(); // _rajaseq_dotprod_end @@ -132,11 +134,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); +// clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { cudot += a[i] * b[i]; }); +// clang-format on dot = cudot.get(); // _rajacuda_dotprod_end @@ -164,11 +168,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); +// clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { hpdot += d_a[i] * d_b[i]; }); +// clang-format on dot = hpdot.get(); // _rajahip_dotprod_end @@ -193,11 +199,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); +// clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { hpdot += a[i] * b[i]; }); +// clang-format on dot = static_cast(hpdot.get()); // _rajasycl_dotprod_end diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 227af7d2be..301cea4f07 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -56,15 +56,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -207,6 +211,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /* using SEQ_EXEC_POL_I = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -235,6 +240,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -251,6 +258,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); */ +// clang-format on // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -271,6 +279,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /* using OPENMP_EXEC_1_POL = +// clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -315,6 +324,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -333,6 +344,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); */ +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -344,6 +356,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_2_POL = 
+// clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -385,6 +398,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -403,6 +418,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); #endif @@ -414,6 +430,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< // @@ -463,7 +480,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -482,6 +501,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -509,6 +529,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); using HIP_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< // @@ -558,7 +579,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -577,6 +600,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -601,6 +625,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /* using SEQ_EXEC_POL_II = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -624,6 +649,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -639,6 +666,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); */ +// clang-format on // _mattranspose_localarray_raja_lambdaargs_end checkResult(Atview, N_c, N_r); @@ -653,6 +681,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -671,9 +700,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index 7b44cd3453..b1b02a6107 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -56,15 +56,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -201,6 +205,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _mattranspose_localarray_raja_start using SEQ_EXEC_POL_I = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -224,6 +229,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -240,6 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _mattranspose_localarray_raja_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -253,6 +261,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_1_POL = +// clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -294,6 +303,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -312,6 +323,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -322,6 +334,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_2_POL = +// clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -363,6 +376,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -381,6 +396,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); #endif @@ -392,6 +408,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< // @@ -441,7 +458,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -460,6 +479,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -487,6 +507,7 @@ 
int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); using HIP_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< // @@ -536,7 +557,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -555,6 +578,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -573,6 +597,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_lambdaargs_start using SEQ_EXEC_POL_II = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -596,6 +621,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -611,6 +638,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_mattranspose_lambdaargs_start +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -623,6 +651,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -641,9 +670,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp index 7316563117..c96daa700e 100644 --- a/exercises/kernel-matrix-transpose-tiled.cpp +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -46,15 +46,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -169,6 +173,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// using TILED_KERNEL_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -181,6 +186,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -234,6 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. 
// using TILED_KERNEL_EXEC_POL_OMP2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -245,6 +252,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -304,6 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); using TILED_KERNEL_EXEC_POL_HIP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, @@ -318,6 +327,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { d_Atview(col, row) = d_Aview(row, col); @@ -345,6 +355,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -363,9 +374,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernel-matrix-transpose-tiled_solution.cpp b/exercises/kernel-matrix-transpose-tiled_solution.cpp index 9124a1b174..66bf54c505 100644 --- a/exercises/kernel-matrix-transpose-tiled_solution.cpp +++ b/exercises/kernel-matrix-transpose-tiled_solution.cpp @@ -46,15 +46,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -159,6 +163,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _raja_tiled_mattranspose_start using TILED_KERNEL_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -171,6 +176,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -191,6 +197,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // one of the inner loops. // using TILED_KERNEL_EXEC_POL_OMP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -203,6 +210,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -222,6 +230,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. 
// using TILED_KERNEL_EXEC_POL_OMP2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -233,6 +242,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -252,6 +262,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_start using TILED_KERNEL_EXEC_POL_CUDA = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, @@ -266,6 +277,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); @@ -292,6 +304,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); using TILED_KERNEL_EXEC_POL_HIP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, @@ -306,6 +319,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { d_Atview(col, row) = d_Aview(row, col); @@ -333,6 +347,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -351,9 +366,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernel-matrix-transpose.cpp b/exercises/kernel-matrix-transpose.cpp index 04f71bf7e0..32c8664e56 100644 --- a/exercises/kernel-matrix-transpose.cpp +++ b/exercises/kernel-matrix-transpose.cpp @@ -34,15 +34,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -235,6 +239,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -253,9 +258,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernel-matrix-transpose_solution.cpp b/exercises/kernel-matrix-transpose_solution.cpp index 4dab678520..e988bc7784 100644 --- a/exercises/kernel-matrix-transpose_solution.cpp +++ b/exercises/kernel-matrix-transpose_solution.cpp @@ -34,15 +34,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -122,6 +126,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _raja_mattranspose_start using KERNEL_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -130,6 +135,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -149,6 +155,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // one of the inner loops. // using KERNEL_EXEC_POL_OMP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -157,6 +164,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -174,6 +182,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_start using KERNEL_EXEC_POL_CUDA = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_thread_x_loop, @@ -184,6 +193,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); @@ -210,6 +220,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -228,9 +239,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/kernelintro-execpols.cpp b/exercises/kernelintro-execpols.cpp index fdffc21ca9..0b93690a03 100644 --- a/exercises/kernelintro-execpols.cpp +++ b/exercises/kernelintro-execpols.cpp @@ -166,6 +166,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_omp_outer_start using EXEC_POL2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -176,6 +177,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -186,6 +189,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_outer_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -218,6 +222,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_omp_collapse_start using EXEC_POL3 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -225,6 +230,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -235,6 +242,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -273,6 +281,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_cuda_start using EXEC_POL5 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k @@ -285,6 +294,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -295,6 +306,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -317,6 +329,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_cuda_tiled_direct_start using EXEC_POL6 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -335,6 +348,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -345,6 +360,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_tiled_direct_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -360,10 +376,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); +// clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); +// clang-format on nested_init <<>>(a, c, N); cudaErrchk( cudaGetLastError() ); @@ -395,6 +413,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // 
_raja_tensorinit_hip_start using EXEC_POL7 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k @@ -407,6 +426,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -417,6 +438,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); @@ -439,6 +461,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_hip_tiled_direct_start using EXEC_POL8 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -457,6 +480,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -467,6 +492,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_tiled_direct_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp index c5041e01a9..e74ec473cb 100644 --- a/exercises/kernelintro-execpols_solution.cpp +++ b/exercises/kernelintro-execpols_solution.cpp @@ -124,6 +124,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_seq_start using EXEC_POL1 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -134,6 +135,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -144,6 +147,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_seq_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -180,6 +184,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_omp_outer_start using EXEC_POL2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -190,6 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -200,6 +207,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_outer_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -232,6 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_omp_collapse_start using EXEC_POL3 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -239,6 +248,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -249,6 +260,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -261,6 +273,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_omp_collapse_start using EXEC_POL4 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j @@ -270,6 +283,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -280,6 +295,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -299,6 +315,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_cuda_start using EXEC_POL5 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k @@ -311,6 +328,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -321,6 +340,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -343,6 +363,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_cuda_tiled_direct_start using EXEC_POL6 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -361,6 +382,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -371,6 +394,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_tiled_direct_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -386,10 +410,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); +// clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); +// clang-format on nested_init <<>>(a, c, N); cudaErrchk( cudaGetLastError() ); @@ -421,6 +447,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_hip_start using EXEC_POL7 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k @@ -433,6 +460,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -443,6 +472,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); @@ -465,6 +495,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_tensorinit_hip_tiled_direct_start using EXEC_POL8 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernelFixed< i_block_sz * 
j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -483,6 +514,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on +// clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -493,6 +526,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_tiled_direct_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp index 406ea7e581..2d0736ac07 100644 --- a/exercises/kernelintro-nested-loop-reorder.cpp +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -86,6 +86,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "...\n\n" << " (I, J, K)\n" << " ---------\n"; // _raja_kji_loops_start +// clang-format off using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -96,6 +97,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); diff --git a/exercises/kernelintro-nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp index 9df3ff4657..ff55d11f07 100644 --- a/exercises/kernelintro-nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -86,6 +86,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "...\n\n" << " (I, J, K)\n" << " ---------\n"; // _raja_kji_loops_start +// clang-format off using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -96,6 +97,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); @@ -123,6 +125,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "...\n\n" << " (I, J, K)\n" << " ---------\n"; // _raja_jik_loops_start +// clang-format off using JIK_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // j RAJA::statement::For<0, RAJA::seq_exec, // i @@ -133,6 +136,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); @@ -161,6 +165,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "...\n\n" << " (I, J, K)\n" << " ---------\n"; // _raja_ikj_loops_start +// clang-format off using IKJ_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // i RAJA::statement::For<2, RAJA::seq_exec, // k @@ -171,6 +176,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index eea48d073a..582b18ce80 100644 --- 
a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -55,15 +55,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -178,6 +182,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -208,6 +213,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _mattranspose_localarray_raja_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -234,6 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -265,6 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -288,6 +296,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool cuda_async = false; using cuda_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -320,6 +329,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -360,6 +370,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -392,6 +403,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -406,6 +418,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -424,9 +437,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index fe2d41ecec..bb777cd729 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -55,15 +55,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -178,6 +182,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -209,6 +214,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _mattranspose_localarray_raja_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -228,6 +234,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -259,6 +266,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -283,6 +291,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool cuda_async = false; using cuda_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -315,6 +324,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -355,6 +365,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -387,6 +398,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -401,6 +413,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -419,9 +432,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp index 1206cbc680..7f3e4f377b 100644 --- a/exercises/launch-matrix-transpose-tiled.cpp +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -46,15 +46,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -168,6 +172,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -198,6 +203,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); // _raja_tiled_mattranspose_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -225,6 +231,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /* +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -246,6 +253,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); */ +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -282,6 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /* +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -304,6 +313,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); */ +// clang-format on checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -341,6 +351,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -363,6 +374,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -385,6 +397,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -403,9 +416,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launch-matrix-transpose-tiled_solution.cpp b/exercises/launch-matrix-transpose-tiled_solution.cpp index 646040f6f0..fb296721a6 100644 --- a/exercises/launch-matrix-transpose-tiled_solution.cpp +++ b/exercises/launch-matrix-transpose-tiled_solution.cpp @@ -46,15 +46,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -162,6 +166,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -182,6 +187,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tiled_mattranspose_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -200,6 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -221,6 +228,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -248,6 +256,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool cuda_async = false; using cuda_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -270,6 +279,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end +// clang-format on checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -304,6 +314,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -326,6 +337,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -348,6 +360,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -366,9 +379,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launch-matrix-transpose.cpp b/exercises/launch-matrix-transpose.cpp index 7cd96429bb..cf504fe0ff 100644 --- a/exercises/launch-matrix-transpose.cpp +++ b/exercises/launch-matrix-transpose.cpp @@ -34,15 +34,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -124,6 +128,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; +// clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -141,6 +146,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -190,6 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async = false; //execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -204,6 +211,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end +// clang-format on checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -225,6 +233,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -243,9 +252,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launch-matrix-transpose_solution.cpp b/exercises/launch-matrix-transpose_solution.cpp index a7822bc1c7..05ef616500 100644 --- a/exercises/launch-matrix-transpose_solution.cpp +++ b/exercises/launch-matrix-transpose_solution.cpp @@ -34,15 +34,19 @@ constexpr int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -124,6 +128,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -138,6 +143,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_end +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -154,6 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_policy_omp = RAJA::LoopPolicy; using launch_policy_omp = RAJA::LaunchPolicy; +// clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -168,6 +175,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); +// clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -185,6 +193,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async = false; //execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -199,6 +208,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end +// clang-format on checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -220,6 +230,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -238,9 +249,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/launchintro-execpols.cpp b/exercises/launchintro-execpols.cpp index 10c2b0e302..e5dc7b4f70 100644 --- a/exercises/launchintro-execpols.cpp +++ b/exercises/launchintro-execpols.cpp @@ -133,6 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -145,6 +146,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); // _raja_tensorinit_seq_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -193,6 +195,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ using launch_policy_2 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -210,6 +213,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_omp_outer_end +// clang-format on checkResult(a, a_ref, N_tot); #endif @@ -248,6 +252,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -264,6 +269,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); +// clang-format on // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); @@ -286,6 +292,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -313,6 +320,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_cuda_tiled_direct_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -328,10 +336,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); +// clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); +// clang-format on nested_init <<>>(a, c, N); cudaErrchk( cudaGetLastError() ); @@ -381,6 +391,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -398,6 +409,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_hip_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); @@ -421,6 +433,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, 
n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -448,6 +461,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_hip_tiled_direct_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp index 1bfff68acf..2439988b06 100644 --- a/exercises/launchintro-execpols_solution.cpp +++ b/exercises/launchintro-execpols_solution.cpp @@ -126,6 +126,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -141,6 +142,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_seq_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -180,6 +182,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_policy_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; +// clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -195,6 +198,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_omp_outer_end +// clang-format on checkResult(a, a_ref, N_tot); #endif @@ -233,6 +237,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -249,6 +254,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); +// clang-format on // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); @@ -271,6 +277,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -298,6 +305,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_cuda_tiled_direct_end +// clang-format on checkResult(a, a_ref, N_tot); @@ -313,10 +321,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); +// clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); +// clang-format on nested_init <<>>(a, c, N); cudaErrchk( cudaGetLastError() ); @@ -366,6 +376,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -383,6 +394,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_hip_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), 
hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); @@ -406,6 +418,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; +// clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -433,6 +446,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_hip_tiled_direct_end +// clang-format on hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); checkResult(a, a_ref, N_tot); diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp index 62d3d6e3e7..960142a83b 100644 --- a/exercises/memoryManager.hpp +++ b/exercises/memoryManager.hpp @@ -31,6 +31,7 @@ namespace memoryManager static camp::resources::Resource* sycl_res; #endif +// clang-format off template T *allocate(RAJA::Index_type size) { diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp index 3432adbb50..058308de74 100644 --- a/exercises/offset-layout-stencil.cpp +++ b/exercises/offset-layout-stencil.cpp @@ -190,9 +190,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_views_start const int DIM = 2; +// clang-format off RAJA::OffsetLayout layout = RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); +// clang-format on RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end @@ -213,6 +215,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -221,6 +224,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -271,6 +275,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row @@ -281,6 +286,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { @@ -316,6 +322,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajahip_start using NESTED_EXEC_POL4 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -326,6 +333,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { diff --git a/exercises/offset-layout-stencil_solution.cpp b/exercises/offset-layout-stencil_solution.cpp index f212ca7630..79b056bafd 100644 --- a/exercises/offset-layout-stencil_solution.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -191,9 +191,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_views_start const int DIM = 2; +// clang-format off RAJA::OffsetLayout layout = RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); +// clang-format on RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // 
_offsetlayout_views_end @@ -214,6 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -222,6 +225,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -249,6 +253,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajaomp_start using NESTED_EXEC_POL2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -256,6 +261,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -284,6 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row @@ -294,6 +301,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { @@ -332,6 +340,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajahip_start using NESTED_EXEC_POL4 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -342,6 +351,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp index 2fb9d7ac56..2ba290d27d 100644 --- a/exercises/permuted-layout-batch-matrix-multiply.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -179,6 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using INIT_POL = RAJA::seq_exec; #endif +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { for (int row = 0; row < N_r; ++row) { for (int col = 0; col < N_c; ++col) { @@ -193,6 +194,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } }); +// clang-format on //----------------------------------------------------------------------------// @@ -204,6 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // _permutedlayout_batchedmatmult_loop_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -239,6 +242,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_loop_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -260,6 +264,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { // _permutedlayout2_batchedmatmult_loop_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -296,6 +301,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout2_batchedmatmult_loop_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -320,6 +326,7 @@ int main(int 
RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // _permutedlayout_batchedmatmult_omp_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -356,6 +363,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_omp_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -378,6 +386,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -414,6 +423,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -439,6 +449,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -475,6 +486,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -496,6 +508,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -532,6 +545,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -564,6 +578,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -600,6 +615,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -639,6 +655,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -675,6 +692,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -714,6 +732,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // check result // +// clang-format off template void checkResult(T C, int nMat, int nRows, int nCols) { diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp index 297ec45047..9d2f707907 100644 --- a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -169,6 +169,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using INIT_POL = RAJA::seq_exec; #endif +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { for (int row = 0; row < N_r; ++row) { for (int col = 0; col < N_c; ++col) { @@ -183,6 +184,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } }); +// clang-format on 
//----------------------------------------------------------------------------// @@ -194,6 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // _permutedlayout_batchedmatmult_loop_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -229,6 +232,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_loop_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -249,6 +253,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // _permutedlayout2_batchedmatmult_loop_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -285,6 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout2_batchedmatmult_loop_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -308,6 +314,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // _permutedlayout_batchedmatmult_omp_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -344,6 +351,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_omp_end +// clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -365,6 +373,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -401,6 +410,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -425,6 +435,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -461,6 +472,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -481,6 +493,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -517,6 +530,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -558,6 +572,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -594,6 +609,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -614,6 +630,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -650,6 +667,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); +// clang-format on RAJA::Timer::ElapsedType tMin = 
timer.elapsed(); if (tMin < minRun) minRun = tMin; @@ -688,6 +706,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // check result // +// clang-format off template void checkResult(T C, int nMat, int nRows, int nCols) { diff --git a/exercises/scan.cpp b/exercises/scan.cpp index 68f52fce2b..c0c8ef7182 100644 --- a/exercises/scan.cpp +++ b/exercises/scan.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// clang-format off #define OP_PLUS_INT RAJA::operators::plus #define OP_MIN_INT RAJA::operators::minimum #define OP_MAX_INT RAJA::operators::maximum #define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); #define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); +// clang-format on #include #include @@ -364,6 +366,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Function to check inclusive scan result // +// clang-format off template void checkInclusiveScanResult(const T* in, const T* out, int N) { diff --git a/exercises/scan_solution.cpp b/exercises/scan_solution.cpp index 7ed7101192..7338260c1e 100644 --- a/exercises/scan_solution.cpp +++ b/exercises/scan_solution.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// clang-format off #define OP_PLUS_INT RAJA::operators::plus #define OP_MIN_INT RAJA::operators::minimum #define OP_MAX_INT RAJA::operators::maximum #define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); #define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); +// clang-format on #include #include @@ -109,10 +111,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_inclusive_seq_plus_start +// clang-format off RAJA::inclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_seq_plus_end +// clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -125,10 +129,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_seq_plus_start +// clang-format off RAJA::exclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_seq_plus_end +// clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -141,9 +147,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_inclusive_inplace_seq_min_start std::copy_n(in, N, out); +// clang-format off RAJA::inclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::minimum{}); // _scan_inclusive_inplace_seq_min_end +// clang-format on CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); @@ -156,9 +164,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_inplace_seq_max_start +// clang-format off RAJA::exclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::maximum{}); // _scan_exclusive_inplace_seq_max_end +// clang-format on CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) printArray(out, N); @@ -174,10 +184,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; // _scan_inclusive_omp_plus_start +// clang-format off RAJA::inclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // 
_scan_inclusive_omp_plus_end +// clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -190,10 +202,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_inplace_omp_plus_start +// clang-format off RAJA::exclusive_scan_inplace( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end +// clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -214,10 +228,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_inclusive_inplace_cuda_plus_start +// clang-format off RAJA::inclusive_scan_inplace>( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end +// clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -230,10 +246,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_inplace_cuda_plus_start +// clang-format off RAJA::exclusive_scan_inplace>( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_cuda_plus_end +// clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -246,11 +264,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_cuda_plus_start +// clang-format off RAJA::exclusive_scan>( RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_cuda_plus_end +// clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -276,10 +296,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); // _scan_inclusive_inplace_hip_plus_start +// clang-format off RAJA::inclusive_scan_inplace>( RAJA::make_span(d_out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_hip_plus_end +// clang-format on hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); @@ -294,11 +316,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); +// clang-format off RAJA::exclusive_scan>( RAJA::make_span(d_in, N), RAJA::make_span(d_out, N), RAJA::operators::plus{}); +// clang-format on hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -327,6 +351,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Function to check inclusive scan result // +// clang-format off template void checkInclusiveScanResult(const T* in, const T* out, int N) { diff --git a/exercises/sort.cpp b/exercises/sort.cpp index 21a5fb5edd..9244ed199f 100644 --- a/exercises/sort.cpp +++ b/exercises/sort.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// clang-format off #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less @@ -49,6 +50,7 @@ #if defined(RAJA_ENABLE_CUDA) //constexpr int CUDA_BLOCK_SIZE = 16; #endif +// clang-format on #if defined(RAJA_ENABLE_HIP) //constexpr int HIP_BLOCK_SIZE = 16; @@ -410,6 +412,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) return 0; } +// clang-format off template bool equivalent(T const& a, T const& b, Comparator comp) { diff --git 
a/exercises/sort_solution.cpp b/exercises/sort_solution.cpp index 98f65c6dbe..a82dd5d4a5 100644 --- a/exercises/sort_solution.cpp +++ b/exercises/sort_solution.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// clang-format off #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less @@ -49,6 +50,7 @@ #if defined(RAJA_ENABLE_CUDA) constexpr int CUDA_BLOCK_SIZE = 16; #endif +// clang-format on #if defined(RAJA_ENABLE_HIP) constexpr int HIP_BLOCK_SIZE = 16; @@ -137,9 +139,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _sort_seq_less_start +// clang-format off RAJA::sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_seq_less_end +// clang-format on //checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); @@ -153,9 +157,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _sort_stable_seq_less_start +// clang-format off RAJA::stable_sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_seq_less_end +// clang-format on //checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); @@ -169,9 +175,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _sort_stable_seq_greater_start +// clang-format off RAJA::stable_sort(RAJA::make_span(out, N), RAJA::operators::greater{}); // _sort_stable_seq_greater_end +// clang-format on //checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); @@ -186,10 +194,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_seq_less_start +// clang-format off RAJA::sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::less{}); // _sort_pairs_seq_less_end +// clang-format on //checkUnstableSortResult>(in, out, in_vals, out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); @@ -204,10 +214,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_seq_greater_start +// clang-format off RAJA::stable_sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end +// clang-format on //checkStableSortResult>(in, out, in_vals, out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); @@ -226,9 +238,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _sort_omp_less_start +// clang-format off RAJA::sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_omp_less_end +// clang-format on //checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); @@ -243,10 +257,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_omp_greater_start +// clang-format off RAJA::stable_sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end +// clang-format on //checkStableSortResult>(in, out, in_vals, out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); @@ -269,10 +285,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_cuda_greater_start +// clang-format off RAJA::sort_pairs>(RAJA::make_span(out, N), 
RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end +// clang-format on //checkUnstableSortResult>(in, out, in_vals, out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); @@ -286,9 +304,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _sort_stable_cuda_less_start +// clang-format off RAJA::stable_sort>(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_cuda_less_end +// clang-format on //checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); @@ -316,10 +336,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); +// clang-format off RAJA::sort_pairs>(RAJA::make_span(d_out, N), RAJA::make_span(d_out_vals, N), RAJA::operators::less{}); +// clang-format on hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); @@ -337,10 +359,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); // _sort_stable_hip_greater_start +// clang-format off RAJA::stable_sort>( RAJA::make_span(d_out, N), RAJA::operators::greater{}); // _sort_stable_hip_greater_end +// clang-format on hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); @@ -371,6 +395,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) return 0; } +// clang-format off template bool equivalent(T const& a, T const& b, Comparator comp) { diff --git a/exercises/tutorial_halfday/ex5_line-of-sight.cpp b/exercises/tutorial_halfday/ex5_line-of-sight.cpp index c17fb2eb8a..c94e8a4132 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight.cpp @@ -264,6 +264,7 @@ int checkResult(int* visible, int* visible_ref, int len) // // Function to print array. // +// clang-format off template void printArray(T* v, int len) { diff --git a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp index 12348816a1..d5d242da86 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp @@ -271,6 +271,7 @@ int checkResult(int* visible, int* visible_ref, int len) // // Function to print array. // +// clang-format off template void printArray(T* v, int len) { diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp index 4d29f7b3ae..2362292b30 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp @@ -322,6 +322,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // For array printing, 'stride1dim' indicates which mesh dimenstride is +// clang-format off // stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). 
// diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp index 51aad20dae..1efd533eee 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp @@ -208,9 +208,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // we don't need a permutation for this case. // +// clang-format off RAJA::OffsetLayout B_layout = RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}); +// clang-format on RAJA::View> Bview(B, B_layout); RAJA::View> Aview(A, Nc_int, Nr_int); @@ -329,6 +331,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // For array printing, 'stride1dim' indicates which mesh dimenstride is +// clang-format off // stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). // diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index d183c221fa..2a568de38a 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -43,15 +43,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -170,6 +174,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // #if 0 using KERNEL_EXEC_POL_SEQ = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -183,6 +188,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif +// clang-format on /// /// TODO... @@ -209,6 +215,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // #if 0 using KERNEL_EXEC_POL_OMP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -222,6 +229,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif +// clang-format on /// /// TODO... @@ -250,6 +258,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. // using KERNEL_EXEC_POL_OMP2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -263,6 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { @@ -283,6 +293,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using KERNEL_EXEC_POL_CUDA = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -298,6 +309,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif +// clang-format on /// /// TODO... @@ -330,6 +342,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. 
// +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -348,9 +361,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp index dbb9a75c20..b3c8fac085 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp @@ -41,15 +41,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -163,6 +167,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_SEQ = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -177,6 +182,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); @@ -197,6 +203,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -211,6 +218,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { @@ -235,6 +243,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -248,6 +257,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { @@ -267,6 +277,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using KERNEL_EXEC_POL_CUDA = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -283,6 +294,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { @@ -310,6 +322,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -328,9 +341,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp index 71743ba2d4..000dbf5db1 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp @@ -47,15 +47,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -205,6 +209,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using SEQ_EXEC_POL = +// clang-format off RAJA::KernelPolicy< // Fill in sequential outer loop tiling execution statements.... // (sequential outer row loop, sequential inner column loop)... @@ -232,6 +237,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on /// /// TODO... /// @@ -272,6 +278,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using OPENMP_EXEC_POL = +// clang-format off RAJA::KernelPolicy< // Fill in the outer loop tiling execttion statements // (OpenMP outer row loop, sequential inner column loop)... @@ -298,6 +305,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on /// /// TODO... /// @@ -340,6 +348,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if 0 using CUDA_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< // Fill in the outer loop tiling execttion statements @@ -372,6 +381,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on /// /// TODO... /// @@ -421,6 +431,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -439,9 +450,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index 1900bf1157..2f122a6f12 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -47,15 +47,19 @@ const int DIM = 2; // // Function for checking results // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on // // Function for printing results // +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); +// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -202,6 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using SEQ_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -231,6 +236,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), @@ -258,6 +264,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, @@ -286,6 +293,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), @@ -315,6 +323,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = +// clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -349,6 +358,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; +// clang-format on RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), @@ -387,6 +397,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // +// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -405,9 +416,11 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; +// clang-format on // // Function to print result. 
// +// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/exercises/tutorial_halfday/memoryManager.hpp b/exercises/tutorial_halfday/memoryManager.hpp index 83fb8cb3bb..82c64bcf2f 100644 --- a/exercises/tutorial_halfday/memoryManager.hpp +++ b/exercises/tutorial_halfday/memoryManager.hpp @@ -27,6 +27,7 @@ namespace memoryManager { +// clang-format off template T *allocate(RAJA::Index_type size) { diff --git a/exercises/vector-addition.cpp b/exercises/vector-addition.cpp index dbe5260f6d..7b9b36c2d9 100644 --- a/exercises/vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -116,10 +116,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// // _rajaseq_vector_add_start +// clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end +// clang-format on checkResult(c, c_ref, N); //printArray(c, N); diff --git a/exercises/vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp index 3bbc070731..9bc491ad0c 100644 --- a/exercises/vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -127,12 +127,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA SIMD vector addition...\n"; +// clang-format off RAJA::forall( RAJA::TypedRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; } ); +// clang-format on checkResult(c, c_ref, N); //printArray(c, N); @@ -200,11 +202,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); // _rajacuda_vector_add_start +// clang-format off RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_vector_add_end +// clang-format on cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); @@ -223,11 +227,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajacuda_explicit_vector_add_start const bool Asynchronous = true; +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_explicit_vector_add_end +// clang-format on cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); @@ -250,11 +256,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); // _rajahip_vector_add_start +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajahip_vector_add_end +// clang-format on hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); @@ -281,11 +289,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); // _rajasycl_vector_add_start +// clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajasycl_vector_add_end +// clang-format on memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); diff --git a/exercises/vertexsum-indexset.cpp b/exercises/vertexsum-indexset.cpp index 258250a741..5763e5b48f 100644 --- a/exercises/vertexsum-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -324,9 +324,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); // 
_raja_vertexarea_cuda_start +// clang-format off using EXEC_POL2 = RAJA::ExecPolicy>; +// clang-format on RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; @@ -383,9 +385,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA HIP index set vertex sum...\n"; // _raja_vertexarea_hip_start +// clang-format off using EXEC_POL3 = RAJA::ExecPolicy>; +// clang-format on RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(d_e2v_map[4*ie]); d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; diff --git a/exercises/vertexsum-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp index 5c1617343a..d81fc9487d 100644 --- a/exercises/vertexsum-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -316,9 +316,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); // _raja_vertexarea_cuda_start +// clang-format off using EXEC_POL2 = RAJA::ExecPolicy>; +// clang-format on RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; @@ -375,9 +377,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA HIP index set vertex sum...\n"; // _raja_vertexarea_hip_start +// clang-format off using EXEC_POL3 = RAJA::ExecPolicy>; +// clang-format on RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(d_e2v_map[4*ie]); d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp index 0f9383e95e..5b9601a21e 100644 --- a/exercises/view-layout.cpp +++ b/exercises/view-layout.cpp @@ -479,12 +479,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start +// clang-format off RAJA::OffsetLayout<1, int> offlayout_1D = RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); +// clang-format on +// clang-format off RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, offlayout_1D); +// clang-format on for (int i = imin; i < imax; ++i) { aoview_1Doff(i) = i; } @@ -563,14 +567,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_permofflayout2D_start std::array perm1D {{1, 0}}; +// clang-format off RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, {{imax, jmax}}, perm1D ); +// clang-format on +// clang-format off RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, permofflayout_2D); +// clang-format on iter = 0; for (int j = jmin; j < jmax; ++j) { for (int i = imin; i < imax; ++i) { @@ -600,6 +608,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. 
// +// clang-format off template void checkResult(T* C, T* Cref, int N) { @@ -616,6 +625,8 @@ void checkResult(T* C, T* Cref, int N) } }; +// clang-format on +// clang-format off template void printValues(T* C, int N) { diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp index 7614c993a8..26316bc2c7 100644 --- a/exercises/view-layout_solution.cpp +++ b/exercises/view-layout_solution.cpp @@ -490,12 +490,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start +// clang-format off RAJA::OffsetLayout<1, int> offlayout_1D = RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); +// clang-format on +// clang-format off RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, offlayout_1D); +// clang-format on for (int i = imin; i < imax; ++i) { aoview_1Doff(i) = i; } @@ -536,12 +540,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout2D_start +// clang-format off RAJA::OffsetLayout<2, int> offlayout_2D = RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); +// clang-format on +// clang-format off RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, offlayout_2D); iter = 0; +// clang-format on for (int i = imin; i < imax; ++i) { for (int j = jmin; j < jmax; ++j) { aoview_2Doff(i, j) = iter; @@ -581,14 +589,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_permofflayout2D_start std::array perm1D {{1, 0}}; +// clang-format off RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, {{imax, jmax}}, perm1D ); +// clang-format on +// clang-format off RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, permofflayout_2D); +// clang-format on iter = 0; for (int j = jmin; j < jmax; ++j) { for (int i = imin; i < imax; ++i) { @@ -618,6 +630,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. 
// +// clang-format off template void checkResult(T* C, T* Cref, int N) { @@ -634,6 +647,8 @@ void checkResult(T* C, T* Cref, int N) } }; +// clang-format on +// clang-format off template void printValues(T* C, int N) { From 48253521df89597bebce2fdbea3d235e6848ebb3 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 8 Oct 2024 15:17:50 -0700 Subject: [PATCH 3/5] tweak script --- exercises/kernel-matrix-transpose-local-array_solution.cpp | 4 ---- exercises/kernel-matrix-transpose_solution.cpp | 7 ------- scripts/clang-format-on-off.py | 3 --- 3 files changed, 14 deletions(-) diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index b1b02a6107..60a1271d34 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -56,19 +56,15 @@ constexpr int DIM = 2; // // Function for checking results // -// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); -// clang-format on // // Function for printing results // -// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); -// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { diff --git a/exercises/kernel-matrix-transpose_solution.cpp b/exercises/kernel-matrix-transpose_solution.cpp index e988bc7784..691c895344 100644 --- a/exercises/kernel-matrix-transpose_solution.cpp +++ b/exercises/kernel-matrix-transpose_solution.cpp @@ -34,19 +34,15 @@ constexpr int DIM = 2; // // Function for checking results // -// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c); -// clang-format on // // Function for printing results // -// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c); -// clang-format on int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -220,7 +216,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to check result and report P/F. // -// clang-format off template void checkResult(RAJA::View> Atview, int N_r, int N_c) { @@ -239,11 +234,9 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c) } }; -// clang-format on // // Function to print result. 
// -// clang-format off template void printResult(RAJA::View> Atview, int N_r, int N_c) { diff --git a/scripts/clang-format-on-off.py b/scripts/clang-format-on-off.py index 6d7c0453da..6eb8a5495b 100644 --- a/scripts/clang-format-on-off.py +++ b/scripts/clang-format-on-off.py @@ -19,8 +19,6 @@ def find_nested_template_declarations(inp : list[string], cutoff : int = 2) -> l num_closing_braces = 0 num_paren_open = inp[ptr1].count('(') num_paren_close = inp[ptr1].count(')') - # stack for tracking bracket closure - # bool for multiline template class invocation is_template_decl = ( (num_brackets > num_closing_brackets or @@ -59,7 +57,6 @@ def add_clang_format_comments(inp : list[string], insertions: list[tuple[int,int if not done_inserting and i == insertions[current_insertion_idx][0]: out.append(clang_format_off) out.append(inp[i]) - #print(i, insertions[current_insertion_idx][1] + 1) if not done_inserting and i == insertions[current_insertion_idx][1] + 1: out.append(clang_format_on) # we've handled one off-on pair, increment pointer in insertion array From 1ddca4128b3dc95ed75275abe7caaaf143c674b7 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 8 Oct 2024 15:30:19 -0700 Subject: [PATCH 4/5] apply formatting to examples and exercises --- .clang-format | 69 ++- CMakeLists.txt | 5 +- cmake/RAJAMacros.cmake | 68 ++ examples/dynamic-forall.cpp | 80 ++- examples/dynamic_mat_transpose.cpp | 180 +++--- examples/forall-param-reductions.cpp | 214 +++---- examples/forall_multi-reductions.cpp | 107 ++-- examples/jacobi.cpp | 274 ++++---- examples/kernel-dynamic-tile.cpp | 20 +- examples/launch-param-reductions.cpp | 178 +++--- examples/launch_flatten.cpp | 52 +- examples/launch_matrix-multiply.cpp | 400 ++++++------ examples/launch_reductions.cpp | 140 +++-- examples/memoryManager.hpp | 2 +- examples/multiview.cpp | 156 ++--- examples/omp-target-kernel.cpp | 34 +- examples/omp-target-ltimes.cpp | 142 +++-- examples/pi-reduce_vs_atomic.cpp | 138 +++-- examples/plugin/counter-plugin.cpp | 43 +- examples/plugin/test-plugin-dynamic.cpp | 9 +- examples/plugin/test-plugin.cpp | 10 +- examples/plugin/timer-plugin.cpp | 18 +- examples/raja-launch.cpp | 58 +- examples/red-black-gauss-seidel.cpp | 79 +-- examples/resource-dynamic-forall.cpp | 104 ++-- examples/resource-forall.cpp | 376 +++++------ examples/resource-kernel.cpp | 30 +- examples/resource-launch.cpp | 24 +- examples/resource-runtime-launch.cpp | 148 +++-- examples/tut_daxpy.cpp | 223 +++---- examples/tut_halo-exchange.cpp | 20 +- examples/tut_launch_basic.cpp | 165 ++--- examples/tut_matrix-multiply.cpp | 586 +++++++++--------- examples/wave-eqn.cpp | 115 ++-- exercises/atomic-histogram.cpp | 153 ++--- exercises/atomic-histogram_solution.cpp | 181 +++--- exercises/dot-product.cpp | 100 +-- exercises/dot-product_solution.cpp | 87 +-- .../kernel-matrix-transpose-local-array.cpp | 126 ++-- ...-matrix-transpose-local-array_solution.cpp | 112 ++-- exercises/kernel-matrix-transpose-tiled.cpp | 115 ++-- ...kernel-matrix-transpose-tiled_solution.cpp | 137 ++-- exercises/kernel-matrix-transpose.cpp | 61 +- .../kernel-matrix-transpose_solution.cpp | 106 ++-- exercises/kernelintro-execpols.cpp | 274 ++++---- exercises/kernelintro-execpols_solution.cpp | 294 ++++----- exercises/kernelintro-nested-loop-reorder.cpp | 141 +++-- ...rnelintro-nested-loop-reorder_solution.cpp | 159 ++--- .../launch-matrix-transpose-local-array.cpp | 89 +-- ...-matrix-transpose-local-array_solution.cpp | 88 +-- exercises/launch-matrix-transpose-tiled.cpp | 133 ++-- 
...launch-matrix-transpose-tiled_solution.cpp | 88 +-- exercises/launch-matrix-transpose.cpp | 86 +-- .../launch-matrix-transpose_solution.cpp | 59 +- exercises/launchintro-execpols.cpp | 255 ++++---- exercises/launchintro-execpols_solution.cpp | 259 ++++---- exercises/memoryManager.hpp | 56 +- exercises/offset-layout-stencil.cpp | 244 ++++---- exercises/offset-layout-stencil_solution.cpp | 268 ++++---- .../permuted-layout-batch-matrix-multiply.cpp | 398 ++++++------ ...-layout-batch-matrix-multiply_solution.cpp | 223 +++---- exercises/reductions.cpp | 142 +++-- exercises/reductions_solution.cpp | 204 +++--- exercises/scan.cpp | 109 ++-- exercises/scan_solution.cpp | 129 ++-- exercises/segment-indexset-basics.cpp | 163 +++-- .../segment-indexset-basics_solution.cpp | 186 +++--- exercises/sort.cpp | 178 +++--- exercises/sort_solution.cpp | 185 +++--- exercises/tutorial_halfday/ex2_approx-pi.cpp | 105 ++-- .../ex2_approx-pi_solution.cpp | 119 ++-- .../tutorial_halfday/ex5_line-of-sight.cpp | 142 +++-- .../ex5_line-of-sight_solution.cpp | 177 +++--- .../ex6_stencil-offset-layout.cpp | 229 +++---- .../ex6_stencil-offset-layout_solution.cpp | 222 +++---- .../ex8_tiled-matrix-transpose.cpp | 77 +-- .../ex8_tiled-matrix-transpose_solution.cpp | 110 ++-- .../ex9_matrix-transpose-local-array.cpp | 54 +- ..._matrix-transpose-local-array_solution.cpp | 115 ++-- exercises/vector-addition.cpp | 203 +++--- exercises/vector-addition_solution.cpp | 217 +++---- exercises/vertexsum-indexset.cpp | 472 +++++++------- exercises/vertexsum-indexset_solution.cpp | 488 ++++++++------- exercises/view-layout.cpp | 469 +++++++------- exercises/view-layout_solution.cpp | 481 +++++++------- 85 files changed, 7236 insertions(+), 6299 deletions(-) diff --git a/.clang-format b/.clang-format index 1d2ad9a77f..b6fa54b233 100644 --- a/.clang-format +++ b/.clang-format @@ -1,27 +1,70 @@ -BasedOnStyle : google +BasedOnStyle : LLVM +# Indent formatting IndentWidth : 2 -BreakBeforeBraces : Linux +Language: Cpp +UseTab: Never KeepEmptyLinesAtTheStartOfBlocks : true MaxEmptyLinesToKeep : 2 AccessModifierOffset : -2 -UseTab: Never +# This must be off so that include order in RAJA is preserved +SortIncludes: false + +# Alignment of consecutive declarations, assignments etc +AlignConsecutiveAssignments : true +AlignConsecutiveDeclarations : false +AlignConsecutiveMacros : true +AlignTrailingComments : true +AlwaysBreakAfterDefinitionReturnType: false + +# Control curly brace placement +BreakBeforeBraces : Custom +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: true + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false + +# Pointer alignment +DerivePointerAlignment: false +PointerAlignment: Left AllowShortIfStatementsOnASingleLine : true -ConstructorInitializerAllOnOneLineOrOnePerLine : true AllowShortFunctionsOnASingleLine : true AllowShortLoopsOnASingleLine : false -BinPackParameters : false +AllowAllArgumentsOnNextLine : true AllowAllParametersOfDeclarationOnNextLine : false AlignTrailingComments : true +BinPackArguments : true +BinPackParameters : false +ConstructorInitializerAllOnOneLineOrOnePerLine : true ColumnLimit : 80 -PenaltyBreakBeforeFirstCallParameter : 100 -PenaltyReturnTypeOnItsOwnLine : 65000 -PenaltyBreakString : 10 
-# These improve formatting results but require clang 3.6/7 or higher
-BreakBeforeBinaryOperators : None
-AlignAfterOpenBracket: true
-BinPackArguments : false
+AlignAfterOpenBracket: Align
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-Cpp11BracedListStyle : true
+BreakBeforeBinaryOperators : None
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInConditionalStatement: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b31cbe124..dbe5b3f113 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
 
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..8a19001cc7 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -204,3 +204,71 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+## +## This creates the following parent build targets: +## check - Runs a non file changing style check and CppCheck +## style - In-place code formatting +## +## Creates various child build targets that follow this pattern: +## raja_ +## raja__ +##------------------------------------------------------------------------------ +macro(raja_add_code_checks) + + set(options) + set(singleValueArgs) + set(multiValueArgs) + + # Parse the arguments to the macro + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + # Only do code checks if building raja by itself and not included in + # another project + if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + # Create file globbing expressions that only include directories that contain source + # TODO(bowen) Add examples, exercises and benchmark to the list below + set(_base_dirs "RAJA" "examples" "exercises" "benchmark" "include" "src" "test") + set(_ext_expressions "*.cpp" "*.hpp" "*.inl" + "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh") + + set(_glob_expressions) + foreach(_exp ${_ext_expressions}) + foreach(_base_dir ${_base_dirs}) + list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}") + endforeach() + endforeach() + + # Glob for list of files to run code checks on + set(_sources) + file(GLOB_RECURSE _sources ${_glob_expressions}) + + # Filter out exclusions + #set(_exclude_expressions + # "${PROJECT_SOURCE_DIR}/axom/sidre/examples/lulesh2/*" + # "${PROJECT_SOURCE_DIR}/axom/slam/examples/lulesh2.0.3/*" + # "${PROJECT_SOURCE_DIR}/axom/slam/examples/tinyHydro/*") + #foreach(_exp ${_exclude_expressions}) + # list(FILTER _sources EXCLUDE REGEX ${_exp}) + #endforeach() +# + blt_add_code_checks(PREFIX RAJA + SOURCES ${_sources} + CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format + CPPCHECK_FLAGS --enable=all --inconclusive) + + # Set FOLDER property for code check targets + foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style) + set(_tgt ${arg_PREFIX}_${_suffix}) + if(TARGET ${_tgt}) + set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks") + endif() + endforeach() + endif() + +endmacro(raja_add_code_checks) diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp index 8cab9dddca..f98d4eb781 100644 --- a/examples/dynamic-forall.cpp +++ b/examples/dynamic-forall.cpp @@ -41,11 +41,13 @@ using policy_list = camp::list; // clang-format on -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the policy to run"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW( + "Usage ./dynamic-forall N, where N is the index of the policy to run"); } // @@ -57,58 +59,60 @@ int main(int argc, char *argv[]) const int pol = std::stoi(argv[1]); std::cout << "\n\nRAJA vector addition example...\n"; - std::cout << "Using policy # "<(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + 
for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// - //policy is chosen from the list - RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); + // policy is chosen from the list + RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) + { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. -// + //----------------------------------------------------------------------------// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -124,12 +128,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -140,7 +151,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index 4f1ae550b5..b1a39a9b56 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -85,99 +85,110 @@ using launch_policy = RAJA::LaunchPolicy< * Define team policies. * Up to 3 dimension are supported: x,y,z */ -using outer0 = RAJA::LoopPolicy< - RAJA::seq_exec +using outer0 = RAJA::LoopPolicy; + >; using outer1 = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_for_exec + RAJA::omp_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_y_direct + , + RAJA::cuda_block_y_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_y_direct + , + RAJA::hip_block_y_direct #endif #if defined(RAJA_ENABLE_SYCL) - , - RAJA::sycl_group_1_direct + , + RAJA::sycl_group_1_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ -using inner0 = RAJA::LoopPolicy< - RAJA::seq_exec +using inner0 = RAJA::LoopPolicy; + >; using inner1 = RAJA::LoopPolicy; + >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW( + "Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). 
- // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device + // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose + // device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW( + "Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<(N_r * N_c); - int *At = host_res.allocate(N_r * N_c); + int* A = host_res.allocate(N_r * N_c); + int* At = host_res.allocate(N_r * N_c); // // In the following implementations of matrix transpose, we // use RAJA 'View' objects to access the matrix data. A RAJA view @@ -227,12 +240,14 @@ int main(int argc, char *argv[]) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of shared matrix transpose...\n"; @@ -243,8 +258,10 @@ int main(int argc, char *argv[]) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -255,14 +272,17 @@ int main(int argc, char *argv[]) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -274,19 +294,21 @@ int main(int argc, char *argv[]) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _dynamic_mattranspose_localarray_cstyle_end @@ -296,24 +318,26 @@ int main(int argc, char *argv[]) //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; + std::cout + << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; - //Reset memory + // Reset memory std::memset(At, 0, N_r * N_c * sizeof(int)); #if defined(RAJA_GPU_ACTIVE) - //Allocate device side pointers + // Allocate device side pointers int *d_A = nullptr, *d_At = nullptr; - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { - d_A = device_res.allocate(N_r * N_c); + d_A = device_res.allocate(N_r * N_c); d_At = device_res.allocate(N_r * N_c); device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c); device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c); - //switch host/device pointers so we can reuse the views + // switch host/device pointers so we can reuse the views Aview.set_data(d_A); Atview.set_data(d_At); } @@ -324,7 +348,7 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_shared_mem_end // _dynamic_mattranspose_kernel_start -// clang-format off + // clang-format off RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), @@ -381,10 +405,11 @@ int main(int argc, char *argv[]) }); }); // _dynamic_mattranspose_kernel_end -// clang-format on + // clang-format on #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c); device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c); @@ -396,15 +421,16 @@ int main(int argc, char *argv[]) checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - //Release data + // Release data host_res.deallocate(A); host_res.deallocate(At); #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(d_A); device_res.deallocate(d_At); } @@ -445,11 +471,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - //std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) //<< std::endl; - printf("%d ",Atview(row, col)); + printf("%d ", Atview(row, col)); } std::cout << "" << std::endl; } diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index dddd2d2459..f103365c62 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -47,81 +47,87 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA 
reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // RAJA::resources::Host host_res; int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set a[0] to a different value. Total sum should be 2. -// + // + // Set a[0] to a different value. Total sum should be 2. + // a[0] = 3; -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; constexpr int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be two -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be two + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -// -// Define ValOp Types -// + // + // Define ValOp Types + // using VALOP_INT_SUM = RAJA::expt::ValOp; using VALOP_INT_MIN = RAJA::expt::ValOp; using VALOP_INT_MAX = RAJA::expt::ValOp; - using VALOPLOC_INT_MIN = RAJA::expt::ValLocOp; - using VALOPLOC_INT_MAX = RAJA::expt::ValLocOp; + using VALOPLOC_INT_MIN = + RAJA::expt::ValLocOp; + using VALOPLOC_INT_MAX = + RAJA::expt::ValLocOp; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -134,7 +140,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_minloc2(-1); RAJA::Index_type seq_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(host_res, arange, RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), @@ -165,28 +171,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << seq_min2 << " , " - << 
seq_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << seq_max2 << " , " - << seq_maxloc2 << std::endl; + << seq_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << seq_min2 << " , " << seq_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << seq_max2 << " , " << seq_maxloc2 + << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -200,7 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_minloc2(-1); RAJA::Index_type omp_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(host_res, arange, RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), @@ -231,22 +237,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << omp_min2 << " , " - << omp_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << omp_max2 << " , " - << omp_maxloc2 << std::endl; + << omp_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << omp_min2 << " , " << omp_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << omp_max2 << " , " << omp_maxloc2 + << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_TARGET_OPENMP) std::cout << "\n Running RAJA OpenMP Target reductions...\n"; @@ -254,7 +260,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::resources::Omp omp_res; // _reductions_raja_omppolicy_start - using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; + using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; // _reductions_raja_omppolicy_end int omp_t_sum = 0; @@ -268,7 +274,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_t_minloc2(-1); RAJA::Index_type omp_t_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(omp_res, arange, RAJA::expt::Reduce(&omp_t_sum), RAJA::expt::Reduce(&omp_t_min), @@ -299,23 +305,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " - << omp_t_minloc.getLoc() << std::endl; + << omp_t_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " - << omp_t_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << omp_t_min2 << " , " - << omp_t_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << omp_t_max2 << " , " 
- << omp_t_maxloc2 << std::endl; + << omp_t_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << omp_t_min2 << " , " << omp_t_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << omp_t_max2 << " , " << omp_t_maxloc2 + << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -326,7 +332,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) cuda_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; // _reductions_raja_cudapolicy_end int cuda_sum = 0; @@ -340,7 +346,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type cuda_minloc2(-1); RAJA::Index_type cuda_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(cuda_res, arange, RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), @@ -371,22 +377,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << cuda_min2 << " , " - << cuda_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << cuda_max2 << " , " - << cuda_maxloc2 << std::endl; + << cuda_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << cuda_min2 << " , " << cuda_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << cuda_max2 << " , " << cuda_maxloc2 + << std::endl; cuda_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -397,7 +403,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; // _reductions_raja_hippolicy_end int hip_sum = 0; @@ -411,7 +417,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type hip_minloc2(-1); RAJA::Index_type hip_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(hip_res, arange, RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), @@ -442,23 +448,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << hip_min2 << " , " - << hip_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << hip_max2 << " , " - << hip_maxloc2 << 
std::endl; + << hip_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << hip_min2 << " , " << hip_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << hip_max2 << " , " << hip_maxloc2 + << std::endl; hip_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -469,7 +475,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) sycl_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using EXEC_POL3 = RAJA::sycl_exec; + using EXEC_POL3 = RAJA::sycl_exec; // _reductions_raja_syclpolicy_end int sycl_sum = 0; @@ -483,7 +489,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type sycl_minloc2(-1); RAJA::Index_type sycl_maxloc2(-1); -// clang-format off + // clang-format off RAJA::forall(sycl_res, arange, RAJA::expt::Reduce(&sycl_sum), RAJA::expt::Reduce(&sycl_min), @@ -514,27 +520,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; - std::cout << "\tmin2, loc2 = " << sycl_min2 << " , " - << sycl_minloc2 << std::endl; - std::cout << "\tmax2, loc2 = " << sycl_max2 << " , " - << sycl_maxloc2 << std::endl; + << sycl_maxloc.getLoc() << std::endl; + std::cout << "\tmin2, loc2 = " << sycl_min2 << " , " << sycl_minloc2 + << std::endl; + std::cout << "\tmax2, loc2 = " << sycl_max2 << " , " << sycl_maxloc2 + << std::endl; sycl_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp index 8dad6d8ab5..302d760937 100644 --- a/examples/forall_multi-reductions.cpp +++ b/examples/forall_multi-reductions.cpp @@ -27,10 +27,10 @@ * */ -template < typename t_exec_policy, typename t_multi_reduce_policy > +template struct Backend { - using exec_policy = t_exec_policy; + using exec_policy = t_exec_policy; using multi_reduce_policy = t_multi_reduce_policy; std::string name; @@ -56,34 +56,34 @@ auto example_policies = camp::make_tuple( ); // clang-format on -template < typename exec_policy, typename multi_reduce_policy > +template void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) { - RAJA::MultiReduceSum multi_reduce_sum(num_bins); - RAJA::MultiReduceMin multi_reduce_min(num_bins); - RAJA::MultiReduceMax multi_reduce_max(num_bins); + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); - RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); RAJA::forall(arange, - [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { - - int bin = bins[i]; - - multi_reduce_sum[bin] += a[i]; - multi_reduce_min[bin].min(a[i]); - multi_reduce_max[bin].max(a[i]); - multi_reduce_and[bin] &= a[i]; - multi_reduce_or [bin] |= a[i]; - - }); - - for (int bin = 0; bin < num_bins; ++bin) { + [=] RAJA_HOST_DEVICE(RAJA::Index_type i) + { + int bin = bins[i]; + + multi_reduce_sum[bin] += a[i]; + multi_reduce_min[bin].min(a[i]); + multi_reduce_max[bin].max(a[i]); + multi_reduce_and[bin] &= a[i]; + multi_reduce_or[bin] |= a[i]; + }); + + for (int bin = 0; bin < num_bins; ++bin) + { std::cout << "\tsum[" << bin << "] = " << multi_reduce_sum.get(bin) << '\n'; std::cout << "\tmin[" << bin << "] = " << multi_reduce_min.get(bin) << '\n'; std::cout << "\tmax[" << bin << "] = " << multi_reduce_max.get(bin) << '\n'; std::cout << "\tand[" << bin << "] = " << multi_reduce_and.get(bin) << '\n'; - std::cout << "\tor [" << bin << "] = " << multi_reduce_or .get(bin) << '\n'; + std::cout << "\tor [" << bin << "] = " << multi_reduce_or.get(bin) << '\n'; std::cout << '\n'; } } @@ -92,48 +92,49 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) { // _multi_reductions_array_init_start -// -// Define array length -// - const int N = 1000000; + // + // Define array length + // + const int N = 1000000; const int num_bins = 10; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // camp::resources::Host host_res; int* host_bins = host_res.template allocate(N); int* host_a = host_res.template allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { host_bins[i] = i % num_bins; - host_a[i] = (i % (2*num_bins)) - num_bins; + host_a[i] = (i % (2 * num_bins)) - num_bins; } // _multi_reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// for bin in [0, num_bins) -// - the sum will be (bin - num_bins/2) * N / num_bins -// - the min will be bin - num_bins -// - the max will be bin -// - the and will be min & max -// - the or will be min | max -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // for bin in [0, num_bins) + // - the sum will be (bin - num_bins/2) * N / num_bins + // - the min will be bin - num_bins + // - the max will be bin + // - the and will be min & max + // - the or will be min | max + // + + // + // Define index range for iterating over a elements in all examples + // // _multi_reductions_range_start RAJA::RangeSegment arange(0, N); // _multi_reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// clang-format off + // clang-format off RAJA::for_each_tuple(example_policies, [&](auto const& backend) { std::cout << "Running " << backend.name << " policies" << '\n'; @@ -157,14 +158,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) std::cout << std::endl; }); -// clang-format on -//----------------------------------------------------------------------------// + // clang-format on + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(host_bins); - host_res.deallocate(host_a ); + host_res.deallocate(host_a); std::cout << "\n DONE!...\n"; diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index e86632b427..30a9f6bb76 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -39,7 +39,7 @@ * (I, Iold) and initialized to zero. The first set of * nested for loops apply an iteration of the Jacobi * scheme. The scheme is only applied to the interior - * nodes. + * nodes. 
* * The second set of nested for loops is used to * update Iold and compute the l_2 norm of the @@ -52,7 +52,7 @@ * ----[RAJA Concepts]--------------- * - Forall::nested loop * - RAJA Reduction - * + * */ @@ -63,9 +63,9 @@ * * CUDA_BLOCK_SIZE_Y - Number of threads in the * y-dimension of a cuda thread block - * + * * CUDA_BLOCK_SIZE - Number of threads per threads block -*/ + */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; #endif @@ -80,23 +80,24 @@ const int HIP_BLOCK_SIZE = 256; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double o, h; int n; }; -// +// // ----[Functions]--------- // solution - Function for the analytic solution // computeErr - Displays the maximum error in the solution // double solution(double x, double y); -void computeErr(double *I, grid_s grid); +void computeErr(double* I, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Jacobi Example"<(NN); - double *Iold = memoryManager::allocate(NN); + double* I = memoryManager::allocate(NN); + double* Iold = memoryManager::allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -135,26 +136,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("Standard C++ Loop \n"); - resI2 = 1; + resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // - for (int n = 1; n <= N; ++n) { - for (int m = 1; m <= N; ++m) { + for (int n = 1; n <= N; ++n) + { + for (int m = 1; m <= N; ++m) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); } } @@ -162,12 +166,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Compute residual and update Iold // resI2 = 0.0; - for (int k = 0; k < NN; k++) { + for (int k = 0; k < NN; k++) + { resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); Iold[k] = I[k]; } - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Standard C++ Loop - Maxed out on iterations \n"); exit(-1); } @@ -184,54 +190,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment gridRange(0, NN); RAJA::RangeSegment jacobiRange(1, (N + 1)); -// clang-format off + // clang-format off using jacobiSeqNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; -// clang-format on + // clang-format on printf("RAJA: Sequential Policy - Nested ForallN \n"); - resI2 = 1; + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); /* - * Sequential Jacobi Iteration. + * Sequential Jacobi Iteration. * * Note that a RAJA ReduceSum object is used to accumulate the sum - * for the residual. Since the loop is run sequentially, this is - * not strictly necessary. It is done here for consistency and + * for the residual. Since the loop is run sequentially, this is + * not strictly necessary. It is done here for consistency and * comparison with other RAJA variants in this example. 
- */ - while (resI2 > tol * tol) { + */ + while (resI2 > tol * tol) + { - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = - 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( - gridRange, [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; + RAJA::forall(gridRange, + [=](RAJA::Index_type k) + { + RAJA_resI2 += + (I[k] - Iold[k]) * (I[k] - Iold[k]); + Iold[k] = I[k]; + }); - }); - resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: Sequential - Maxed out on iterations! \n"); exit(-1); } @@ -239,17 +247,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); - - + + #if defined(RAJA_ENABLE_OPENMP) printf("RAJA: OpenMP Policy - Nested ForallN \n"); - resI2 = 1; + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - + /* - * OpenMP parallel Jacobi Iteration. + * OpenMP parallel Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::omp_collapse_for_exec - @@ -258,43 +266,45 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * Note that OpenMP RAJA ReduceSum object performs the reduction * operation for the residual in a thread-safe manner. */ - -// clang-format off + + // clang-format off using jacobiOmpNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; -// clang-format on - while (resI2 > tol * tol) { - - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { + // clang-format on + while (resI2 > tol * tol) + { - - double x = gridx.o + m * gridx.h; - double y = gridx.o + n * gridx.h; + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) + { + double x = gridx.o + m * gridx.h; + double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h * - (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + - Iold[id - 1] + Iold[id + 1]); - }); + int id = n * (N + 2) + m; + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); + }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( gridRange, - [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; - - }); - + RAJA::forall(gridRange, + [=](RAJA::Index_type k) + { + RAJA_resI2 += (I[k] - Iold[k]) * + (I[k] - Iold[k]); + Iold[k] = I[k]; + }); + resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: OpenMP - Maxed out on iterations! 
\n"); exit(-1); } @@ -307,7 +317,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) /* - * CUDA Jacobi Iteration. + * CUDA Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - @@ -319,7 +329,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: CUDA Policy - Nested ForallN \n"); -// clang-format off + // clang-format off using jacobiCUDANestedPolicy = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, @@ -332,38 +342,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > > >; - -// clang-format on - resI2 = 1; + + // clang-format on + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // - // Jacobi Iteration + // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); // // Compute residual and update Iold // RAJA::ReduceSum RAJA_resI2(0.0); -// clang-format off + // clang-format off RAJA::forall>( gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { @@ -372,10 +383,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: CUDA - Maxed out on iterations! 
\n"); exit(-1); } @@ -400,7 +412,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: HIP Policy - Nested ForallN \n"); -// clang-format off + // clang-format off using jacobiHIPNestedPolicy = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, @@ -414,42 +426,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > >; -// clang-format on - resI2 = 1; + // clang-format on + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - double *d_I = memoryManager::allocate_gpu(NN); - double *d_Iold = memoryManager::allocate_gpu(NN); - hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice )); + double* d_I = memoryManager::allocate_gpu(NN); + double* d_Iold = memoryManager::allocate_gpu(NN); + hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - int id = n * (N + 2) + m; - d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] - + d_Iold[id + 1]); + int id = n * (N + 2) + m; + d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + + d_Iold[id - 1] + d_Iold[id + 1]); }); // // Compute residual and update Iold // RAJA::ReduceSum RAJA_resI2(0.0); -// clang-format off + // clang-format off RAJA::forall>( gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { @@ -458,17 +472,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: HIP - Maxed out on iterations! 
\n"); exit(-1); } iteration++; } hipDeviceSynchronize(); - hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); @@ -478,7 +493,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate(I); memoryManager::deallocate(Iold); - + return 0; } @@ -494,27 +509,28 @@ double solution(double x, double y) // // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} // -void computeErr(double *I, grid_s grid) +void computeErr(double* I, grid_s grid) { RAJA::RangeSegment gridRange(0, grid.n); RAJA::ReduceMax tMax(-1.0); -// clang-format off + // clang-format off using jacobiSeqNestedPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; -// clang-format on - RAJA::kernel(RAJA::make_tuple(gridRange,gridRange), - [=] (RAJA::Index_type ty, RAJA::Index_type tx ) { - - int id = tx + grid.n * ty; - double x = grid.o + tx * grid.h; - double y = grid.o + ty * grid.h; - double myErr = std::abs(I[id] - solution(x, y)); - tMax.max(myErr); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(gridRange, gridRange), + [=](RAJA::Index_type ty, RAJA::Index_type tx) + { + int id = tx + grid.n * ty; + double x = grid.o + tx * grid.h; + double y = grid.o + ty * grid.h; + double myErr = std::abs(I[id] - solution(x, y)); + tMax.max(myErr); + }); double l2err = tMax; printf("Max error = %lg, h = %f \n", l2err, grid.h); diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp index 80cec6cbdb..cc0b0d71f4 100644 --- a/examples/kernel-dynamic-tile.cpp +++ b/examples/kernel-dynamic-tile.cpp @@ -1,20 +1,20 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA dynamic_tile example...\n\n"; -//Using policy = KernelPolicy, seq_exec, …>>; -//RAJA::kernel_param( -// make_tuple(RangeSegment(0,N)), -// make_tuple(32), // param 0 is referenced by tile_dynamic -// [=](int i, int tile_size){ -// -// }); + // Using policy = KernelPolicy, seq_exec, …>>; + // RAJA::kernel_param( + // make_tuple(RangeSegment(0,N)), + // make_tuple(32), // param 0 is referenced by tile_dynamic + // [=](int i, int tile_size){ + // + // }); using namespace RAJA; -// clang-format off + // clang-format off kernel_param< KernelPolicy< statement::Tile<1, tile_dynamic<1>, seq_exec, @@ -32,5 +32,5 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." 
<< std::endl; }); -// clang-format on + // clang-format on } diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 6280963c6f..3e9c09029e 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -38,7 +38,7 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//LC testing hardware has a limit of 151 +// LC testing hardware has a limit of 151 constexpr int SYCL_BLOCK_SIZE = 128; #endif @@ -48,14 +48,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Use a resource to allocate memory -// + // + // Use a resource to allocate memory + // RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) RAJA::resources::Cuda device_res; @@ -68,76 +68,82 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set a[0] to a different value. Total sum should be 2. -// + // + // Set a[0] to a different value. Total sum should be 2. + // a[0] = 3; -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; constexpr int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be two -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be two + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -// -// Define ValOp Types -// + // + // Define ValOp Types + // using VALOP_INT_SUM = RAJA::expt::ValOp; using VALOP_INT_MIN = RAJA::expt::ValOp; using VALOP_INT_MAX = RAJA::expt::ValOp; - using VALOPLOC_INT_MIN = RAJA::expt::ValLocOp; - using VALOPLOC_INT_MAX = RAJA::expt::ValLocOp; + using VALOPLOC_INT_MIN = + RAJA::expt::ValLocOp; + using VALOPLOC_INT_MAX = + RAJA::expt::ValLocOp; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using LAUNCH_POL1 = RAJA::LaunchPolicy; - using LOOP_POL1 = RAJA::LoopPolicy; + using LAUNCH_POL1 = RAJA::LaunchPolicy; + using LOOP_POL1 = 
RAJA::LoopPolicy; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -150,7 +156,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_minloc2(-1); RAJA::Index_type seq_maxloc2(-1); -// clang-format off + // clang-format off RAJA::launch (host_res, RAJA::LaunchParams(), "SeqReductionKernel", RAJA::expt::Reduce(&seq_sum), @@ -187,25 +193,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using LAUNCH_POL2 = RAJA::LaunchPolicy; - using LOOP_POL2 = RAJA::LoopPolicy; + using LAUNCH_POL2 = RAJA::LaunchPolicy; + using LOOP_POL2 = RAJA::LoopPolicy; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -219,7 +225,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_minloc2(-1); RAJA::Index_type omp_maxloc2(-1); -// clang-format off + // clang-format off RAJA::launch (host_res, RAJA::LaunchParams(), "OmpReductionKernel", RAJA::expt::Reduce(&omp_sum), @@ -256,18 +262,18 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -276,11 +282,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_cudapolicy_end - const int NUMBER_OF_TEAMS = (N-1)/CUDA_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / CUDA_BLOCK_SIZE + 1; int cuda_sum = 0; int cuda_min = std::numeric_limits::max(); @@ -293,7 +299,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type cuda_minloc2(-1); RAJA::Index_type cuda_maxloc2(-1); -// clang-format off + // clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), "CUDAReductionKernel", @@ -334,19 +340,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// 
clang-format on + // clang-format on std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -355,11 +361,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_hippolicy_end - const int NUMBER_OF_TEAMS = (N-1)/HIP_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / HIP_BLOCK_SIZE + 1; int hip_sum = 0; int hip_min = std::numeric_limits::max(); @@ -372,7 +378,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type hip_minloc2(-1); RAJA::Index_type hip_maxloc2(-1); -// clang-format off + // clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), "HipReductionKernel", @@ -410,19 +416,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -431,11 +437,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using LAUNCH_POL4 = RAJA::LaunchPolicy>; - using LOOP_POL4 = RAJA::LoopPolicy; + using LAUNCH_POL4 = RAJA::LaunchPolicy>; + using LOOP_POL4 = RAJA::LoopPolicy; // _reductions_raja_syclpolicy_end - const int NUMBER_OF_TEAMS = (N-1)/SYCL_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / SYCL_BLOCK_SIZE + 1; int sycl_sum = 0; int sycl_min = std::numeric_limits::max(); @@ -448,7 +454,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type sycl_minloc2(-1); RAJA::Index_type sycl_maxloc2(-1); -// clang-format off + // clang-format off RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), "SyclReductionKernel", @@ -486,23 +492,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " 
<< sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; + << sycl_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/launch_flatten.cpp b/examples/launch_flatten.cpp index e56461506d..71449ea771 100644 --- a/examples/launch_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -34,16 +34,18 @@ */ #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::cuda_reduce; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::hip_reduce; #endif @@ -52,9 +54,9 @@ using reduce_policy = RAJA::hip_reduce; */ using host_launch = RAJA::LaunchPolicy; -using host_loop = RAJA::LoopPolicy; +using host_loop = RAJA::LoopPolicy; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) @@ -62,21 +64,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Problem size dimensions // - constexpr int N = 4; - constexpr int NN = N*N; + constexpr int N = 4; + constexpr int NN = N * N; // // Configure grid size // - RAJA::LaunchParams launch_params(RAJA::Teams(1), - RAJA::Threads(N, N)); + RAJA::LaunchParams launch_params(RAJA::Teams(1), RAJA::Threads(N, N)); // // Resource object for host, used to allocate memory // camp::resources::Host host_res; - int *h_A_ptr = host_res.allocate(NN); + int* h_A_ptr = host_res.allocate(NN); // // Resource object for device, used to allocate memory @@ -87,9 +88,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) camp::resources::Hip device_res; #endif - int *d_A_ptr = device_res.allocate(NN); + int* d_A_ptr = device_res.allocate(NN); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running device version of teams_flatten example ...\n"; @@ -97,7 +98,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); -// clang-format off + // clang-format off RAJA::launch (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -118,8 +119,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); 
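// A side note on the "flatten" pattern used in this example: d_A_2DView and
// d_A_1DView (and likewise h_A_2DView / h_A_1DView defined just below for the
// host version) wrap the same pointer with default row-major layouts, so the
// two indexings alias:
//
//   d_A_2DView(r, c) == d_A_1DView(r * N + c)   for 0 <= r, c < N
//
// That aliasing is what lets a single flattened loop over [0, NN) (using the
// device_flatten_pol alias above) visit every element the preceding 2D thread
// loops wrote, without a second kernel launch. A minimal host-side sketch of
// this assumption (illustrative only, not part of the example):
//
//   for (int r = 0; r < N; ++r)
//     for (int c = 0; c < N; ++c)
//       assert(&h_A_2DView(r, c) == &h_A_1DView(r * N + c));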
-// clang-format on -//----------------------------------------------------------------------------// + // clang-format on + //----------------------------------------------------------------------------// std::cout << "\n Running host version of teams_flatten example ...\n"; @@ -127,7 +128,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); -// clang-format off + // clang-format off RAJA::launch (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -148,10 +149,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on - if ( device_kernel_sum.get() == host_kernel_sum.get() ) { + // clang-format on + if (device_kernel_sum.get() == host_kernel_sum.get()) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } diff --git a/examples/launch_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp index bccf7ccd87..4324b8ba31 100644 --- a/examples/launch_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -31,7 +31,7 @@ /* * Define number of threads in x and y dimensions in a RAJA thread team * or in a CUDA/HIP thread blocks -*/ + */ #define THREAD_SZ 16 /* @@ -54,22 +54,22 @@ using launch_policy = RAJA::LaunchPolicy< using loop_policy = RAJA::seq_exec; #if defined(RAJA_ENABLE_CUDA) -using gpu_block_x_policy = RAJA::cuda_block_x_direct; -using gpu_block_y_policy = RAJA::cuda_block_y_direct; -using gpu_thread_x_policy = RAJA::cuda_thread_x_loop; -using gpu_thread_y_policy = RAJA::cuda_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::cuda_global_thread_x; -using gpu_global_thread_y_policy = RAJA::cuda_global_thread_y; +using gpu_block_x_policy = RAJA::cuda_block_x_direct; +using gpu_block_y_policy = RAJA::cuda_block_y_direct; +using gpu_thread_x_policy = RAJA::cuda_thread_x_loop; +using gpu_thread_y_policy = RAJA::cuda_thread_y_loop; +using gpu_global_thread_x_policy = RAJA::cuda_global_thread_x; +using gpu_global_thread_y_policy = RAJA::cuda_global_thread_y; using gpu_global_thread_xy_policy = RAJA::cuda_global_thread_xy; #endif #if defined(RAJA_ENABLE_HIP) -using gpu_block_x_policy = RAJA::hip_block_x_direct; -using gpu_block_y_policy = RAJA::hip_block_y_direct; -using gpu_thread_x_policy = RAJA::hip_thread_x_loop; -using gpu_thread_y_policy = RAJA::hip_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::hip_global_thread_x; -using gpu_global_thread_y_policy = RAJA::hip_global_thread_y; +using gpu_block_x_policy = RAJA::hip_block_x_direct; +using gpu_block_y_policy = RAJA::hip_block_y_direct; +using gpu_thread_x_policy = RAJA::hip_thread_x_loop; +using gpu_thread_y_policy = RAJA::hip_thread_y_loop; +using gpu_global_thread_x_policy = RAJA::hip_global_thread_x; +using gpu_global_thread_y_policy = RAJA::hip_global_thread_y; using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; #endif @@ -79,39 +79,45 @@ using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; */ using teams_x = RAJA::LoopPolicy; + >; using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; using global_thread_x = RAJA::LoopPolicy; + >; using global_thread_y = RAJA::LoopPolicy; + >; // // Define dimensionality of matrices. 
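// A note on the loop policy aliases above: each RAJA::LoopPolicy is assumed
// to pair a host policy with a device policy (host argument first, device
// argument second), so the same loop body can run on either back end, e.g.
// roughly:
//
//   using teams_x   = RAJA::LoopPolicy<loop_policy, gpu_block_x_policy>;
//   using threads_x = RAJA::LoopPolicy<loop_policy, gpu_thread_x_policy>;
//
// RAJA::launch then selects the first argument when executing on the host and
// the second when executing on the device. The angle-bracket arguments shown
// here are a sketch of that pairing, not a copy of the original typedefs.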
@@ -136,9 +142,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -149,8 +157,8 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) { - int Row = blockIdx.y*THREAD_SZ + threadIdx.y; - int Col = blockIdx.x*THREAD_SZ + threadIdx.x; + int Row = blockIdx.y * THREAD_SZ + threadIdx.y; + int Col = blockIdx.x * THREAD_SZ + threadIdx.x; __shared__ double As[THREAD_SZ][THREAD_SZ]; __shared__ double Bs[THREAD_SZ][THREAD_SZ]; @@ -158,15 +166,16 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) Cs[threadIdx.y][threadIdx.x] = 0.0; - for (int k = 0; k < (THREAD_SZ + N - 1)/THREAD_SZ; k++) { + for (int k = 0; k < (THREAD_SZ + N - 1) / THREAD_SZ; k++) + { - if ( static_cast(k*THREAD_SZ + threadIdx.x) < N && Row < N ) - As[threadIdx.y][threadIdx.x] = A[Row*N + k*THREAD_SZ + threadIdx.x]; + if (static_cast(k * THREAD_SZ + threadIdx.x) < N && Row < N) + As[threadIdx.y][threadIdx.x] = A[Row * N + k * THREAD_SZ + threadIdx.x]; else As[threadIdx.y][threadIdx.x] = 0.0; - if ( static_cast(k*THREAD_SZ + threadIdx.y) < N && Col < N) - Bs[threadIdx.y][threadIdx.x] = B[(k*THREAD_SZ + threadIdx.y)*N + Col]; + if (static_cast(k * THREAD_SZ + threadIdx.y) < N && Col < N) + Bs[threadIdx.y][threadIdx.x] = B[(k * THREAD_SZ + threadIdx.y) * N + Col]; else Bs[threadIdx.y][threadIdx.x] = 0.0; @@ -179,8 +188,8 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) } if (Row < N && Col < N) - C[((blockIdx.y * blockDim.y + threadIdx.y)*N) + - (blockIdx.x * blockDim.x)+ threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; + C[((blockIdx.y * blockDim.y + threadIdx.y) * N) + + (blockIdx.x * blockDim.x) + threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; } #endif @@ -188,7 +197,7 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); // clang-format off template @@ -199,7 +208,7 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); // clang-format off template @@ -207,62 +216,66 @@ void printResult(RAJA::View> Cview, int N); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix and number of teams based on -// number of threads in a dimension. -// - const int N = 1000; - const int NTeams = (N - 1)/THREAD_SZ + 1; - -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); + // + // Define num rows/cols in matrix and number of teams based on + // number of threads in a dimension. + // + const int N = 1000; + const int NTeams = (N - 1) / THREAD_SZ + 1; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. 
+ // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::RangeSegment row_range(0, N); RAJA::RangeSegment col_range(0, N); @@ -271,56 +284,59 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// -// -// RAJA Team loops uses a RAJA::launch method to launch a kernel. -// These examples, illustrate the basic interface and mechanics. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) -// execution is chosen at run time and we support running on the host -// or device. -// -// 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP -// and considers programming using a group of threads in which we group into -// teams. Number of threads and teams are defined inside the Resources struct. -// -// 3) Launch context is used synchronize threads within a team, an example of this -// is presented further below. -// -// 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism can be -// expressed by mapping outer loops (up to 3) to gpu blocks (teams) and inner -// loops to threads in a block (team). 
-// + //----------------------------------------------------------------------------// + // + // RAJA Team loops uses a RAJA::launch method to launch a kernel. + // These examples, illustrate the basic interface and mechanics. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) + // execution is chosen at run time and we support running on the host + // or device. + // + // 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP + // and considers programming using a group of threads in which we group + // into teams. Number of threads and teams are defined inside the Resources + // struct. + // + // 3) Launch context is used synchronize threads within a team, an example of + // this + // is presented further below. + // + // 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism + // can be + // expressed by mapping outer loops (up to 3) to gpu blocks (teams) and + // inner loops to threads in a block (team). + // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //As a starting point we demonstrate assigning each dot product - //to a thread on a two dimensional compute grid. Rows are mapped - //to threads in the x dimension, while Cols are mapped to threads - //in the y dimension. On the host this mapping simplifies to executing - //two for loops. + // As a starting point we demonstrate assigning each dot product + // to a thread on a two dimensional compute grid. Rows are mapped + // to threads in the x dimension, while Cols are mapped to threads + // in the y dimension. On the host this mapping simplifies to executing + // two for loops. // _matmult_basickernel_start -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -339,31 +355,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_basickernel_end -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //RAJA Team loops currently only support a pair of policies at a time. - //Switching between a sequential and OpenMP launch space requires - //recompiling execution policies. When running exclusively on the host - //the compute grid may be left uninitialized as loop methods get expanded to - //standard C style loops. + // RAJA Team loops currently only support a pair of policies at a time. + // Switching between a sequential and OpenMP launch space requires + // recompiling execution policies. When running exclusively on the host + // the compute grid may be left uninitialized as loop methods get expanded to + // standard C style loops. 
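// For a concrete picture of that expansion: on the host, the two nested
// launch loops of the basic kernel above amount to ordinary C loops over the
// same views (a sketch; the exact nesting order is an assumption here):
//
//   for (int col = 0; col < N; ++col) {
//     for (int row = 0; row < N; ++row) {
//       double dot = 0.0;
//       for (int k = 0; k < N; ++k) {
//         dot += Aview(row, k) * Bview(k, col);
//       }
//       Cview(row, col) = dot;
//     }
//   }
//
// i.e. the same computation as the C-style version checked earlier, with the
// OpenMP variant differing only in how the outer loop iterations are divided
// among threads.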
using omp_launch_policy = RAJA::LaunchPolicy; using omp_col_policy0 = RAJA::LoopPolicy; using omp_row_policy0 = RAJA::LoopPolicy; -// clang-format off + // clang-format off RAJA::launch(RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -380,15 +396,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example collapses the row and col loops in an OpenMP parallel region. @@ -397,7 +413,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using global_thread_xy = RAJA::LoopPolicy; -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -414,18 +430,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); -#endif // if RAJA_ENABLE_OPENMP +// printResult(Cview, N); +#endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example maps row indicies to RAJA teams (CUDA @@ -437,7 +453,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), @@ -456,26 +472,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device, tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device, tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
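// To make the bounds masking concrete: with N = 1000 and THREAD_SZ = 16, the
// tile loops produce NTeams = (1000 - 1)/16 + 1 = 63 tiles per dimension.
// The first 62 tiles are full 16-wide RangeSegments, while the last covers
// only 1000 - 62*16 = 8 iterations, so the loops inside a tile never see an
// index >= N and no explicit "if (row < N && col < N)" guard is needed, in
// contrast to the raw CUDA/HIP kernels defined near the top of this file.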
// -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -501,27 +517,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -#endif // if RAJA_ENABLE_CUDA +#endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -537,7 +553,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), @@ -557,29 +573,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); -// clang-format on - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
// -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -605,20 +621,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); -// clang-format on - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); -#endif // if RAJA_ENABLE_HIP +// printResult(Cview, N); +#endif // if RAJA_ENABLE_HIP //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult with shared memory ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - using seq_loop = RAJA::LoopPolicy; + using seq_loop = RAJA::LoopPolicy; // // This example builds on the RAJA tiling capabilies presented earlier @@ -630,7 +646,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // -// clang-format off + // clang-format off RAJA::launch(RAJA::ExecPlace::DEVICE, RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), RAJA::Threads(THREAD_SZ,THREAD_SZ)), @@ -693,24 +709,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // kernel -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. 
matMultKernel<<>>(N, C, A, B); @@ -719,20 +736,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Cview, N); - std::cout << "\n Running CUDA tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout + << "\n Running CUDA tiled mat-mult with shared memory (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); sharedMatMultKernel<<>>(N, C, A, B); cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -#endif // if RAJA_ENABLE_CUDA +#endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -741,47 +759,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, + d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); - std::cout << "\n Running HIP tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout + << "\n Running HIP tiled mat-mult with shared memory (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((sharedMatMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL((sharedMatMultKernel), dim3(griddim), dim3(blockdim), 0, 0, + N, d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); -#endif // if RAJA_ENABLE_HIP +#endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index ba611f7782..57833fe232 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -29,31 +29,33 @@ #if defined(RAJA_ENABLE_OPENMP) using host_launch = RAJA::omp_launch_t; -using host_loop = RAJA::omp_for_exec; +using host_loop = RAJA::omp_for_exec; #else -using host_launch = RAJA::seq_launch_t; -using host_loop = RAJA::seq_exec; +using host_launch = RAJA::seq_launch_t; +using host_loop = RAJA::seq_exec; #endif #if defined(RAJA_ENABLE_CUDA) using device_launch = RAJA::cuda_launch_t; -using device_loop = RAJA::cuda_global_thread_x; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) using device_launch = RAJA::hip_launch_t; -using device_loop = RAJA::hip_global_thread_x; +using device_loop = RAJA::hip_global_thread_x; #endif using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -66,11 +68,13 @@ using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW( + "Usage ./launch_reductions host or ./launch_reductions device"); } // @@ -79,77 +83,93 @@ int main(int argc, char *argv[]) // Example usage ./launch_reductions host or ./launch_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW( + "Usage ./launch_reductions host or ./launch_reductions device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Launch reductions example on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Launch reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Launch reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Launch reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
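// (A quick check of the arithmetic behind the "sum will be zero" note below:
//  the alternating +1/-1 pattern over an even N sums to 0, and the two
//  overwrites cancel as well -- a[N/2] changes from +1 to -100 (delta -101)
//  while a[N/2 + 1] changes from -1 to +100 (delta +101) -- so the total
//  stays 0.)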
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; const int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); -// clang-format off + // clang-format off RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), @@ -171,20 +191,20 @@ int main(int argc, char *argv[]) }); -// clang-format on + // clang-format on std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp index 960142a83b..a6f2ed8384 100644 --- a/examples/memoryManager.hpp +++ b/examples/memoryManager.hpp @@ -28,7 +28,7 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif // clang-format off diff --git a/examples/multiview.cpp b/examples/multiview.cpp index b765dc84d4..c9672a7a0f 100644 --- a/examples/multiview.cpp +++ b/examples/multiview.cpp @@ -15,12 +15,12 @@ * A RAJA::MultiView object wraps an array-of-pointers, * or a pointer-to-pointers, whereas a RAJA::View wraps a single * pointer or array. This allows a single RAJA::Layout to be applied to - * multiple arrays internal to the MultiView, allowing multiple arrays to share indexing - * arithmetic when their access patterns are the same. - * + * multiple arrays internal to the MultiView, allowing multiple arrays to share + * indexing arithmetic when their access patterns are the same. + * * The instantiation of a MultiView works exactly like a standard View, - * except that it takes an array-of-pointers. In the following example, a MultiView - * applies a 1-D layout of length 4 to 2 internal arrays in myarr: + * except that it takes an array-of-pointers. In the following example, a + * MultiView applies a 1-D layout of length 4 to 2 internal arrays in myarr: * * // Arrays of the same size, which will become internal to the MultiView. * int a1[4] = {5,6,7,8}; @@ -31,30 +31,35 @@ * myarr[0] = a1; * myarr[1] = a2; * - * // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - * RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); - * - * The default MultiView accesses internal arrays via the 0th index of the MultiView: - * - * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView( 1, 2 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * + * // This MultiView applies a 1-D layout of length 4 to each internal array + * in myarr. RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + * + * The default MultiView accesses internal arrays via the 0th index of the + * MultiView: + * + * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView( 1, 2 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * * The index into the array-of-pointers can be moved to different - * indices of the MultiView () access operator, rather than the default 0th index. By - * passing a third template parameter to the MultiView constructor, the internal array index - * and the integer indicating which array to access can be reversed: + * indices of the MultiView () access operator, rather than the default 0th + * index. 
By passing a third template parameter to the MultiView constructor, + * the internal array index and the integer indicating which array to access can + * be reversed: * * // MultiView with array-of-pointers index in 1st position * RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); * - * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView1( 2, 1 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * - * As the number of Layout dimensions increases, the index into the array-of-pointers can be - * moved to more distinct locations in the MultiView () access operator. Here is an example - * which compares the accesses of a 2-D layout on a normal RAJA::View with a RAJA::MultiView - * with the array-of-pointers index set to the 2nd position: - * + * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView1( 2, 1 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * + * As the number of Layout dimensions increases, the index into the + * array-of-pointers can be moved to more distinct locations in the MultiView () + * access operator. Here is an example which compares the accesses of a 2-D + * layout on a normal RAJA::View with a RAJA::MultiView with the + * array-of-pointers index set to the 2nd position: + * * RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); * * normalView( 2, 1 ); // accesses 3rd index of the a1 array, value = 7 @@ -62,8 +67,9 @@ * // MultiView with array-of-pointers index in 2nd position * RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); * - * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, returns value of 7 (same as normaView(2,1)) - * MView2( 2, 1, 1 ); // accesses the 3rd index of the 1st internal array a2, returns value of 11 + * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, + * returns value of 7 (same as normaView(2,1)) MView2( 2, 1, 1 ); // accesses + * the 3rd index of the 1st internal array a2, returns value of 11 * * The following code demonstrates 2 aspects of RAJA::MultiView usage: * - Basic usage @@ -75,53 +81,62 @@ void docs_example() // temporaries int t1, t2, t3, t4; - printf( "MultiView Example from RAJA Documentation:\n" ); + printf("MultiView Example from RAJA Documentation:\n"); // _multiview_example_1Dinit_start // Arrays of the same size, which will become internal to the MultiView. - int a1[4] = {5,6,7,8}; - int a2[4] = {9,10,11,12}; + int a1[4] = {5, 6, 7, 8}; + int a2[4] = {9, 10, 11, 12}; // Array-of-pointers which will be passed into MultiView. - int * myarr[2]; + int* myarr[2]; myarr[0] = a1; myarr[1] = a2; - // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + // This MultiView applies a 1-D layout of length 4 to each internal array in + // myarr. 
+ RAJA::MultiView> MView(myarr, 4); // _multiview_example_1Dinit_end // _multiview_example_1Daccess_start - t1 = MView( 0, 3 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t2 = MView( 1, 2 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t1 = MView(0, 3); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t2 = MView(1, 2); // accesses 3rd index of the 1st internal array a2, returns + // value of 11 // _multiview_example_1Daccess_end // _multiview_example_1Daopindex_start // MultiView with array-of-pointers index in 1st position. - RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + RAJA::MultiView, 1> MView1(myarr, 4); - t3 = MView1( 3, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t4 = MView1( 2, 1 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t3 = MView1(3, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t4 = MView1(2, 1); // accesses 3rd index of the 1st internal array a2, + // returns value of 11 // _multiview_example_1Daopindex_end - printf( "Comparison of default MultiView with another MultiView that has the array-of-pointers index in the 1st position of the () accessor:\n" ); - printf( "MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3 ); - printf( "MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4 ); + printf("Comparison of default MultiView with another MultiView that has the " + "array-of-pointers index in the 1st position of the () accessor:\n"); + printf("MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3); + printf("MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4); // _multiview_example_2Daopindex_start - RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + RAJA::View> normalView(a1, 2, 2); - t1 = normalView( 1, 1 ); // accesses 4th index of the a1 array, value = 8 + t1 = normalView(1, 1); // accesses 4th index of the a1 array, value = 8 // MultiView with array-of-pointers index in 2nd position - RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + RAJA::MultiView, 2> MView2(myarr, 2, 2); - t2 = MView2( 1, 1, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 (same as normalView(1,1)) - t3 = MView2( 0, 0, 1 ); // accesses the 1st index of the 1st internal array a2, returns value of 9 + t2 = MView2(1, 1, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 (same as normalView(1,1)) + t3 = MView2(0, 0, 1); // accesses the 1st index of the 1st internal array a2, + // returns value of 9 // _multiview_example_2Daopindex_end - printf( "Comparison of 2D normal View with 2D MultiView that has the array-of-pointers index in the 2nd position of the () accessor:\n" ); - printf( "normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2 ); + printf("Comparison of 2D normal View with 2D MultiView that has the " + "array-of-pointers index in the 2nd position of the () accessor:\n"); + printf("normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2); } int main() @@ -129,11 +144,11 @@ int main() docs_example(); constexpr int N = 12; - int * myarr[2]; // two 3x4 arrays + int* myarr[2]; // two 3x4 arrays int arr1[N]; int arr2[N]; - for ( int ii = 0; ii < N; ++ii ) + for (int ii = 0; ii < N; ++ii) { arr1[ii] = 100 + ii; arr2[ii] = 200 + ii; @@ -143,55 +158,58 @@ int main() myarr[1] = arr2; // 4x3 layout - std::array perm { {0, 1} }; - RAJA::Layout<2> layout = RAJA::make_permuted_layout( - { {4, 3} }, 
perm - ); + std::array perm {{0, 1}}; + RAJA::Layout<2> layout = RAJA::make_permuted_layout({{4, 3}}, perm); // Basic MultiView usage // Default usage: no specified array-of-pointers index moving // 0th position is used as the array-of-pointers index - RAJA::MultiView> arrView(myarr, layout); + RAJA::MultiView> arrView(myarr, + layout); // Moved array-of-pointers index MultiView usage // Add an array-of-pointers index specifier constexpr int aopidx = 1; - RAJA::MultiView, aopidx> arrViewMov(myarr, layout); + RAJA::MultiView, aopidx> arrViewMov( + myarr, layout); // Comparing values of both views - printf ( "Comparing values of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf("Comparing values of both default and 1-index-ed MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, + arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj)); } } } // switch values - printf ( "Switching values\n" ); - for ( int kk = 0; kk < 4; ++kk ) + printf("Switching values\n"); + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - int temp = arrView(0, kk, jj); + int temp = arrView(0, kk, jj); arrView(0, kk, jj) = arrView(1, kk, jj); arrView(1, kk, jj) = temp; } } // Comparing switched values of both views - printf ( "Comparing switched values of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf( + "Comparing switched values of both default and 1-index-ed MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, + arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj)); } } } diff --git a/examples/omp-target-kernel.cpp b/examples/omp-target-kernel.cpp index 8351d7f388..d49b271574 100644 --- a/examples/omp-target-kernel.cpp +++ b/examples/omp-target-kernel.cpp @@ -10,40 +10,40 @@ using namespace RAJA; using namespace RAJA::statement; -int main(int /*argc*/, char** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) +{ -// clang-format off + // clang-format off // using Pol = KernelPolicy< // For<1, RAJA::seq_exec>, // For<0, RAJA::omp_target_parallel_for_exec<1>, Lambda<0> > // >; using Pol = KernelPolicy< -// clang-format on - Collapse, Lambda<0> > >; + // clang-format on + Collapse, Lambda<0>>>; -// clang-format on - double* array = new double[25*25]; + // clang-format on + double* array = new double[25 * 25]; -#pragma omp target enter data map(to: array[0:25*25]) +#pragma omp target enter data map(to : array [0:25 * 25]) #pragma omp target data use_device_ptr(array) #if 1 RAJA::kernel( - RAJA::make_tuple( - RAJA::RangeSegment(0,25), - RAJA::RangeSegment(0,25)), - [=] (int /*i*/, int /*j*/) { - //array[i + (25*j)] = i*j; - // int idx = i; - //array[0] = i*j; - }); + RAJA::make_tuple(RAJA::RangeSegment(0, 25), RAJA::RangeSegment(0, 25)), + [=](int /*i*/, int /*j*/) + { + // array[i + (25*j)] = i*j; + // int idx 
= i; + // array[0] = i*j; + }); #else -// clang-format off + // clang-format off RAJA::forall>( RAJA::RangeSegment(0,25), [=] (int i) { // }); #endif -// clang-format on + // clang-format on } diff --git a/examples/omp-target-ltimes.cpp b/examples/omp-target-ltimes.cpp index f51694b3af..1d2f43924b 100644 --- a/examples/omp-target-ltimes.cpp +++ b/examples/omp-target-ltimes.cpp @@ -9,7 +9,6 @@ #include - #include "RAJA/RAJA.hpp" #include "RAJA/util/Timer.hpp" @@ -28,22 +27,25 @@ RAJA_INDEX_VALUE(IZone, "IZone"); void runLTimesRajaKernel(bool debug, - Index_type num_moments, - Index_type num_directions, - Index_type num_groups, - Index_type num_zones) + Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) { - using namespace RAJA::statement; + using namespace RAJA::statement; // psi[direction, group, zone] - using PsiView = RAJA::TypedView, IDirection, IGroup, IZone>; + using PsiView = RAJA::TypedView, IDirection, + IGroup, IZone>; // phi[moment, group, zone] - using PhiView = RAJA::TypedView, IMoment, IGroup, IZone>; + using PhiView = + RAJA::TypedView, IMoment, IGroup, IZone>; // ell[moment, direction] - using EllView = RAJA::TypedView, IMoment, IDirection>; + using EllView = + RAJA::TypedView, IMoment, IDirection>; // allocate data @@ -54,16 +56,19 @@ void runLTimesRajaKernel(bool debug, // randomize data - for (size_t i = 0; i < ell_data.size(); ++i) { - ell_data[i] = i; //drand48(); + for (size_t i = 0; i < ell_data.size(); ++i) + { + ell_data[i] = i; // drand48(); } - for (size_t i = 0; i < psi_data.size(); ++i) { - psi_data[i] = 2*i; //drand48(); + for (size_t i = 0; i < psi_data.size(); ++i) + { + psi_data[i] = 2 * i; // drand48(); } - for (size_t i = 0; i < phi_data.size(); ++i) { - phi_data[i] = 0; //drand48(); + for (size_t i = 0; i < phi_data.size(); ++i) + { + phi_data[i] = 0; // drand48(); } int hid = omp_get_initial_device(); @@ -71,58 +76,48 @@ void runLTimesRajaKernel(bool debug, // create device memory double *d_ell, *d_phi, *d_psi; - d_ell = static_cast(omp_target_alloc(sizeof(double) * ell_data.size(), did)); - d_phi = static_cast(omp_target_alloc(sizeof(double) * phi_data.size(), did)); - d_psi = static_cast(omp_target_alloc(sizeof(double) * psi_data.size(), did)); + d_ell = static_cast( + omp_target_alloc(sizeof(double) * ell_data.size(), did)); + d_phi = static_cast( + omp_target_alloc(sizeof(double) * phi_data.size(), did)); + d_psi = static_cast( + omp_target_alloc(sizeof(double) * psi_data.size(), did)); // Copy to device - omp_target_memcpy( - &ell_data[0], - d_ell, - sizeof(double) * ell_data.size(), - 0,0, hid, did); - omp_target_memcpy( - &phi_data[0], - d_phi, - sizeof(double) * phi_data.size(), - 0,0,hid,did); - omp_target_memcpy( - &psi_data[0], - d_psi, - sizeof(double) * psi_data.size(), - 0,0,hid,did); + omp_target_memcpy(&ell_data[0], d_ell, sizeof(double) * ell_data.size(), 0, 0, + hid, did); + omp_target_memcpy(&phi_data[0], d_phi, sizeof(double) * phi_data.size(), 0, 0, + hid, did); + omp_target_memcpy(&psi_data[0], d_psi, sizeof(double) * psi_data.size(), 0, 0, + hid, did); // create views on data std::array ell_perm {{0, 1}}; - EllView ell( - d_ell, - make_permuted_layout({{num_moments, num_directions}}, ell_perm)); + EllView ell(d_ell, + make_permuted_layout({{num_moments, num_directions}}, ell_perm)); std::array psi_perm {{0, 1, 2}}; - PsiView psi( - d_psi, - make_permuted_layout({{num_directions, num_groups, num_zones}}, psi_perm)); + PsiView psi(d_psi, make_permuted_layout( + {{num_directions, 
num_groups, num_zones}}, psi_perm)); std::array phi_perm {{0, 1, 2}}; - PhiView phi( - d_phi, - make_permuted_layout({{num_moments, num_groups, num_zones}}, phi_perm)); - + PhiView phi(d_phi, make_permuted_layout( + {{num_moments, num_groups, num_zones}}, phi_perm)); using Pol = RAJA::KernelPolicy< - Collapse, - For<3, RAJA::seq_exec, Lambda<0>>>>; + Collapse, + For<3, RAJA::seq_exec, Lambda<0>>>>; RAJA::Timer timer; timer.start(); - auto segments = RAJA::make_tuple(TypedRangeSegment(0, num_moments), - TypedRangeSegment(0, num_directions), - TypedRangeSegment(0, num_groups), - TypedRangeSegment(0, num_zones)); + auto segments = + RAJA::make_tuple(TypedRangeSegment(0, num_moments), + TypedRangeSegment(0, num_directions), + TypedRangeSegment(0, num_groups), + TypedRangeSegment(0, num_zones)); kernel( @@ -130,56 +125,61 @@ void runLTimesRajaKernel(bool debug, segments, // Lambda_CalcPhi - [=] (IMoment m, IDirection d, IGroup g, IZone z) { - phi(m, g, z) += ell(m, d) * psi(d, g, z); - }); - + [=](IMoment m, IDirection d, IGroup g, IZone z) + { phi(m, g, z) += ell(m, d) * psi(d, g, z); }); timer.stop(); - printf("LTimes took %lf seconds using RAJA::kernel\n", - timer.elapsed()); + printf("LTimes took %lf seconds using RAJA::kernel\n", timer.elapsed()); // Check correctness - if(debug){ + if (debug) + { - size_t errors = 0; + size_t errors = 0; double total_error = 0.; - for (IZone z(0); z < num_zones; ++z) { - for (IGroup g(0); g < num_groups; ++g) { - for (IMoment m(0); m < num_moments; ++m) { + for (IZone z(0); z < num_zones; ++z) + { + for (IGroup g(0); g < num_groups; ++g) + { + for (IMoment m(0); m < num_moments; ++m) + { double total = 0.0; - for (IDirection d(0); d < num_directions; ++d) { + for (IDirection d(0); d < num_directions; ++d) + { double val = ell(m, d) * psi(d, g, z); total += val; } - if(std::abs(total-phi(m,g,z)) > 1e-9){ - ++ errors; + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + ++errors; } - total_error += std::abs(total-phi(m,g,z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if(errors == 0){ + if (errors == 0) + { printf(" -- no errors (%e)\n", total_error); } - else{ + else + { printf(" -- failed : %ld errors\n", (long)errors); } } - } -int main(){ +int main() +{ bool debug = true; int m = 25; int d = 80; int g = 32; - int z = 32*1024; + int z = 32 * 1024; printf("m=%d, d=%d, g=%d, z=%d\n", m, d, g, z); @@ -187,5 +187,3 @@ int main(){ return 0; } - - diff --git a/examples/pi-reduce_vs_atomic.cpp b/examples/pi-reduce_vs_atomic.cpp index 9531bce111..a4a2b4fa91 100644 --- a/examples/pi-reduce_vs_atomic.cpp +++ b/examples/pi-reduce_vs_atomic.cpp @@ -45,55 +45,56 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA pi example...\n"; -// -// Define RangeSegment to enumerate "bins" and "bin step" size used in -// Riemann integral sum to approximate pi, -// and memory location for atomic add operation. -// + // + // Define RangeSegment to enumerate "bins" and "bin step" size used in + // Riemann integral sum to approximate pi, + // and memory location for atomic add operation. 
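// (For reference, the quantity being accumulated is the midpoint-rule
//  approximation of
//      pi = 4 * integral from 0 to 1 of dx / (1 + x^2)
//         ~ 4 * sum_i dx / (1 + x_i^2),   with x_i = (i + 0.5) * dx,
//  which is why every variant below adds dx / (1.0 + x * x) per bin and
//  multiplies the accumulated result by 4 at the end.)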
+ // const int num_bins = 512 * 512; - const double dx = 1.0 / double(num_bins); + const double dx = 1.0 / double(num_bins); - RAJA::RangeSegment bins(0, num_bins); + RAJA::RangeSegment bins(0, num_bins); double* atomic_pi = memoryManager::allocate(1); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < num_bins; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < num_bins; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation (reduction)...\n"; using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using REDUCE_POL1 = RAJA::seq_reduce; RAJA::ReduceSum seq_pi(0.0); - RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); - }); + RAJA::forall(bins, + [=](int i) + { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); + }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; std::cout << "\n Running RAJA sequential pi approximation (atomic)...\n"; @@ -102,20 +103,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; -// clang-format off + // clang-format off RAJA::forall(bins, [=](int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; -// clang-format on + // clang-format on - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -126,14 +126,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum omp_pi(0.0); - RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); - }); + RAJA::forall(bins, + [=](int i) + { + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); + }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; std::cout << "\n Running RAJA OpenMP pi approximation (atomic)...\n"; @@ -142,22 +143,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; -// clang-format off + // clang-format off RAJA::forall(bins, [=](int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; -// clang-format on + // clang-format on - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi 
<< std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -168,14 +168,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); - }); + RAJA::forall(bins, + [=] RAJA_DEVICE(int i) + { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); + }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; std::cout << "\n Running RAJA CUDA pi approximation (atomic)...\n"; @@ -184,20 +185,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; -// clang-format off + // clang-format off RAJA::forall(bins, [=] RAJA_DEVICE (int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; -// clang-format on + // clang-format on - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -208,43 +208,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum hip_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - hip_pi += dx / (1.0 + x * x); - }); + RAJA::forall(bins, + [=] RAJA_DEVICE(int i) + { + double x = (double(i) + 0.5) * dx; + hip_pi += dx / (1.0 + x * x); + }); double hip_pi_val = hip_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << hip_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << hip_pi_val << std::endl; std::cout << "\n Running RAJA HIP pi approximation (atomic)...\n"; - *atomic_pi = 0; + *atomic_pi = 0; double* d_atomic_pi = memoryManager::allocate_gpu(1); - hipErrchk(hipMemcpy( d_atomic_pi, atomic_pi, 1 * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_atomic_pi, atomic_pi, 1 * sizeof(double), + hipMemcpyHostToDevice)); using ATOMIC_POL4 = RAJA::hip_atomic; -// clang-format off + // clang-format off RAJA::forall(bins, [=] RAJA_DEVICE (int i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); }); -// clang-format on - hipErrchk(hipMemcpy( atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost )); - *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + // clang-format on + hipErrchk(hipMemcpy(atomic_pi, d_atomic_pi, 1 * sizeof(double), + hipMemcpyDeviceToHost)); + *atomic_pi *= 4.0; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; memoryManager::deallocate_gpu(d_atomic_pi); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(atomic_pi); std::cout << "\n DONE!...\n"; diff --git a/examples/plugin/counter-plugin.cpp b/examples/plugin/counter-plugin.cpp index 8134cd9b83..24b7180ebb 100644 --- a/examples/plugin/counter-plugin.cpp +++ b/examples/plugin/counter-plugin.cpp @@ -10,45 +10,48 @@ #include -class CounterPlugin : - public RAJA::util::PluginStrategy +class CounterPlugin : public RAJA::util::PluginStrategy { - public: - void preCapture(const RAJA::util::PluginContext& p) override { - if (p.platform == RAJA::Platform::host) +public: + void preCapture(const RAJA::util::PluginContext& p) override + { + if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Capturing host kernel for the " << ++host_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing host kernel for the " + << ++host_capture_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Capturing device kernel for the " << ++device_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing device kernel for the " + << ++device_capture_counter << " time!" << std::endl; } } - void preLaunch(const RAJA::util::PluginContext& p) override { + void preLaunch(const RAJA::util::PluginContext& p) override + { if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching host kernel for the " + << ++host_launch_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching device kernel for the " + << ++device_launch_counter << " time!" << std::endl; } } - private: - int host_capture_counter; - int device_capture_counter; - int host_launch_counter; - int device_launch_counter; +private: + int host_capture_counter; + int device_capture_counter; + int host_launch_counter; + int device_launch_counter; }; // Statically loading plugin. -static RAJA::util::PluginRegistry::add P("Counter", "Counts number of kernel launches."); +static RAJA::util::PluginRegistry::add + P("Counter", "Counts number of kernel launches."); // Dynamically loading plugin. 
-extern "C" RAJA::util::PluginStrategy *getPlugin () -{ - return new CounterPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new CounterPlugin; } // _plugin_example_end diff --git a/examples/plugin/test-plugin-dynamic.cpp b/examples/plugin/test-plugin-dynamic.cpp index c9e574a99e..b73a13441f 100644 --- a/examples/plugin/test-plugin-dynamic.cpp +++ b/examples/plugin/test-plugin-dynamic.cpp @@ -8,15 +8,14 @@ #include "RAJA/RAJA.hpp" #include -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { RAJA::util::init_plugins("../lib/libtimer_plugin.so"); - double *a = new double[10]; + double* a = new double[10]; for (int i = 0; i < 4; i++) { - RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { - a[i] = 0; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/test-plugin.cpp b/examples/plugin/test-plugin.cpp index b18233cb90..2164ae7df9 100644 --- a/examples/plugin/test-plugin.cpp +++ b/examples/plugin/test-plugin.cpp @@ -7,13 +7,13 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { double* a = new double[10]; - for (int i = 0; i < 10; i++) { - RAJA::forall(RAJA::RangeSegment(0,10), [=] (int i) { - a[i] = 0; - }); + for (int i = 0; i < 10; i++) + { + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/timer-plugin.cpp b/examples/plugin/timer-plugin.cpp index 2619f9fcd9..2224f1657f 100644 --- a/examples/plugin/timer-plugin.cpp +++ b/examples/plugin/timer-plugin.cpp @@ -21,15 +21,19 @@ class TimerPlugin : public RAJA::util::PluginStrategy void postLaunch(const RAJA::util::PluginContext& p) override { end_time = std::chrono::steady_clock::now(); - double elapsedMs = std::chrono::duration(end_time - start_time).count(); + double elapsedMs = + std::chrono::duration(end_time - start_time) + .count(); if (p.platform == RAJA::Platform::host) { - printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", + elapsedMs); } else { - printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", + elapsedMs); } } @@ -39,10 +43,8 @@ class TimerPlugin : public RAJA::util::PluginStrategy }; // Dynamically loading plugin. -extern "C" RAJA::util::PluginStrategy *getPlugin() -{ - return new TimerPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new TimerPlugin; } // Statically loading plugin. 
-static RAJA::util::PluginRegistry::add P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file +static RAJA::util::PluginRegistry::add + P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index 80cc251b92..925256f3ea 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -58,36 +58,36 @@ using launch_policy = RAJA::LaunchPolicy< */ using teams_x = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_parallel_for_exec + RAJA::omp_parallel_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_x_direct + , + RAJA::cuda_block_x_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_x_direct + , + RAJA::hip_block_x_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ using threads_x = RAJA::LoopPolicy; + >; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // Resource object for host @@ -111,7 +111,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA teams may switch between host and device policies at run time. // The loop below will execute through the available backends. - for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) + { auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; @@ -119,12 +120,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int N_tri = 5; int* Ddat = nullptr; - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { Ddat = host_res.allocate(N_tri * N_tri); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { Ddat = device_res.allocate(N_tri * N_tri); } #endif @@ -143,16 +146,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * and is used to perform thread synchronizations within a team. 
*/ - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ - std::cout << "\n Running upper triangular pattern example on the host...\n"; - } else { - std::cout << "\n Running upper triangular pattern example on the device...\n"; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { + std::cout + << "\n Running upper triangular pattern example on the host...\n"; + } + else + { + std::cout + << "\n Running upper triangular pattern example on the device...\n"; } RAJA::View> D(Ddat, N_tri, N_tri); -// clang-format off + // clang-format off RAJA::launch (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), @@ -178,13 +186,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // outer lambda -// clang-format on - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + // clang-format on + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { host_res.deallocate(Ddat); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(Ddat); } #endif diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index 4f784b841d..abdd26ff23 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -52,7 +52,8 @@ * h - Spacing between grid points * n - Number of grid points */ -struct grid_s { +struct grid_s +{ double o, h; int n; }; @@ -62,16 +63,16 @@ struct grid_s { * solution - Function for the analytic solution * computeErr - Displays the maximum error in the solution * gsColorPolicy - Generates the custom index set for this example -*/ + */ double solution(double x, double y); -void computeErr(double *I, grid_s grid); -RAJA::TypedIndexSet - gsColorPolicy(int N, camp::resources::Resource res); +void computeErr(double* I, grid_s grid); +RAJA::TypedIndexSet +gsColorPolicy(int N, camp::resources::Resource res); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Red-Black Gauss-Seidel Example"<(NN); + double* I = resource.allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -115,9 +116,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using colorPolicy = RAJA::ExecPolicy; #endif - resI2 = 1; + resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { #if defined(RAJA_ENABLE_OPENMP) RAJA::ReduceSum RAJA_resI2(0.0); @@ -128,33 +130,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Gauss-Seidel Iteration // - RAJA::forall(colorSet, - [=](RAJA::Index_type id) { - - // - // Compute x,y grid index - // - int m = id % (N + 2); - int n = id / (N + 2); - - double x = gridx.o + m * gridx.h; - double y = gridx.o + n * gridx.h; - - double f = gridx.h * gridx.h * - (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - - double newI = -0.25 * (f - I[id - N - 2] - I[id + N + 2] - - I[id - 1] - I[id + 1]); - - double oldI = I[id]; - RAJA_resI2 += (newI - oldI) * (newI - oldI); - I[id] = newI; - - }); + RAJA::forall( + colorSet, + [=](RAJA::Index_type id) + { + // + // Compute x,y grid index + // + int m = id % (N + 2); + int n = id / (N + 2); + + double x = gridx.o + m * gridx.h; + double y = gridx.o + n * gridx.h; + + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + + double newI = -0.25 * (f - I[id - N - 2] - I[id + N + 2] - I[id - 1] - + I[id + 1]); + + 
double oldI = I[id]; + RAJA_resI2 += (newI - oldI) * (newI - oldI); + I[id] = newI; + }); resI2 = RAJA_resI2; - if (iteration > maxIter) { - std::cout<<"Gauss-Seidel maxed out on iterations"< maxIter) + { + std::cout << "Gauss-Seidel maxed out on iterations" << std::endl; break; } diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 27656435ac..de679235bd 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -44,11 +44,13 @@ using policy_list = camp::list(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) @@ -114,30 +121,33 @@ int main(int argc, char *argv[]) #endif #if defined(RAJA_ENABLE_SYCL) RAJA::resources::Sycl device_res; -#endif +#endif - //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + // Get typed erased resource - it will internally store if we are running on + // the host or device +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - RAJA::expt::dynamic_forall - (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - - c[i] = a[i] + b[i]; - - }); + RAJA::expt::dynamic_forall(res, pol, RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) + { + c[i] = a[i] + b[i]; + }); checkResult(c, N); - //printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. -// + //----------------------------------------------------------------------------// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -153,12 +163,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -169,7 +186,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index 7373a5d6fb..5c7110a66a 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -18,7 +18,7 @@ * Vector Addition Example * * Computes c = a + b, where a, b, c are vectors of ints. - * It illustrates similarities between a C-style for-loop and a RAJA + * It illustrates similarities between a C-style for-loop and a RAJA * forall loop. * * RAJA features shown: @@ -35,283 +35,279 @@ // // Functions for checking and printing results // -void checkResult(int* res, int len); +void checkResult(int* res, int len); void printResult(int* res, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA vector addition example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 100000; -// -// Allocate and initialize vector data -// - RAJA::resources::Host host{}; + // + // Allocate and initialize vector data + // + RAJA::resources::Host host {}; - int *a = host.allocate(N); - int *b = host.allocate(N); - int *c = host.allocate(N); + int* a = host.allocate(N); + int* b = host.allocate(N); + int* c = host.allocate(N); - int *a_ = host.allocate(N); - int *b_ = host.allocate(N); - int *c_ = host.allocate(N); + int* a_ = host.allocate(N); + int* b_ = host.allocate(N); + int* c_ = host.allocate(N); - for (int i = 0; i < N; ++i) { - a[i] = -i; - b[i] = 2 * i; + for (int i = 0; i < N; ++i) + { + a[i] = -i; + b[i] = 2 * i; a_[i] = -i; b_[i] = 2 * i; - } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces sequential execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces sequential execution.... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::sind_exec policy enforces simd execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy enforces simd execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA simd_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// RAJA::omp_for_parallel_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_for_parallel_exec policy execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA omp_parallel_for_exec vector addition...\n"; RAJA::forall(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_static_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_static_exec policy execution.... + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; + std::cout << "\n Running RAJA omp_parallel_for_static_exec (default " + "chunksize) vector addition...\n"; -// clang-format off + // clang-format off RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); -// clang-format on + // clang-format on checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_dynamic_exec policy execution....
+ //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; + std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector " + "addition...\n"; -// clang-format off + // clang-format off RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); -// clang-format on + // clang-format on checkResult(c, N); #endif +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - -/* - GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block -*/ -const int GPU_BLOCK_SIZE = 256; + /* + GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block + */ + const int GPU_BLOCK_SIZE = 256; -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy execution.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector addition on 2 seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy execution.... + //----------------------------------------------------------------------------// + { + std::cout + << "\n Running RAJA GPU vector addition on 2 separate streams...\n"; #if defined(RAJA_ENABLE_CUDA) - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - using EXEC_POLICY = RAJA::cuda_exec_async; + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + using EXEC_POLICY = RAJA::cuda_exec_async; #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - using EXEC_POLICY = RAJA::hip_exec_async; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) -RAJA::resources::Sycl res_gpu1; -RAJA::resources::Sycl res_gpu2; -using EXEC_POLICY = RAJA::sycl_exec; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + using EXEC_POLICY = RAJA::sycl_exec; #endif - int* d_a1 = res_gpu1.allocate(N); - int* d_b1 = res_gpu1.allocate(N); - int* d_c1 = res_gpu1.allocate(N); + int* d_a1 = res_gpu1.allocate(N); + int* d_b1 = res_gpu1.allocate(N); + int* d_c1 = res_gpu1.allocate(N); - int* d_a2 = res_gpu2.allocate(N); - int* d_b2 = res_gpu2.allocate(N); - int* d_c2 = res_gpu2.allocate(N); + int* d_a2 = res_gpu2.allocate(N); + int* d_b2 = res_gpu2.allocate(N); + int* d_c2 = res_gpu2.allocate(N); - res_gpu1.memcpy(d_a1, a, sizeof(int)* N); - res_gpu1.memcpy(d_b1, b, sizeof(int)* N); + res_gpu1.memcpy(d_a1, a, sizeof(int) * N); + res_gpu1.memcpy(d_b1, b, sizeof(int) * N); - res_gpu2.memcpy(d_a2, a, sizeof(int)* N); - res_gpu2.memcpy(d_b2, b, sizeof(int)* N); + res_gpu2.memcpy(d_a2, a, sizeof(int) * N); + res_gpu2.memcpy(d_b2, b, sizeof(int) * N); - RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c1[i] = d_a1[i] + d_b1[i]; - }); + RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) + { d_c1[i] = d_a1[i] + d_b1[i]; }); - RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c2[i] = d_a2[i] + d_b2[i]; - }); + RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) + { d_c2[i] = d_a2[i] + d_b2[i]; }); - res_gpu1.memcpy(c, d_c1, sizeof(int)*N ); + res_gpu1.memcpy(c,
d_c1, sizeof(int) * N); - res_gpu2.memcpy(c_, d_c2, sizeof(int)*N ); + res_gpu2.memcpy(c_, d_c2, sizeof(int) * N); - checkResult(c, N); - checkResult(c_, N); + checkResult(c, N); + checkResult(c_, N); - res_gpu1.deallocate(d_a1); - res_gpu1.deallocate(d_b1); - res_gpu1.deallocate(d_c1); + res_gpu1.deallocate(d_a1); + res_gpu1.deallocate(d_b1); + res_gpu1.deallocate(d_c1); - res_gpu2.deallocate(d_a2); - res_gpu2.deallocate(d_b2); - res_gpu2.deallocate(d_c2); -} + res_gpu2.deallocate(d_a2); + res_gpu2.deallocate(d_b2); + res_gpu2.deallocate(d_c2); + } -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy with waiting event.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector with dependency between two seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy with waiting event.... + //----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA GPU vector with dependency between two " + "separate streams...\n"; #if defined(RAJA_ENABLE_CUDA) - // _raja_res_defres_start - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - RAJA::resources::Host res_host; + // _raja_res_defres_start + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::cuda_exec_async; - // _raja_res_defres_end + using EXEC_POLICY = RAJA::cuda_exec_async; + // _raja_res_defres_end #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::hip_exec_async; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) - RAJA::resources::Sycl res_gpu1; - RAJA::resources::Sycl res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::sycl_exec; + using EXEC_POLICY = RAJA::sycl_exec; #endif - // _raja_res_alloc_start - int* d_array1 = res_gpu1.allocate(N); - int* d_array2 = res_gpu2.allocate(N); - int* h_array = res_host.allocate(N); - // _raja_res_alloc_end - - // _raja_res_k1_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] = i; - } - ); - // _raja_res_k1_end - - // _raja_res_k2_start - RAJA::resources::Event e = RAJA::forall(res_gpu2, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array2[i] = -1; - } - ); - // _raja_res_k2_end - - // _raja_res_wait_start - res_gpu2.wait_for(&e); - // _raja_res_wait_end - - // _raja_res_k3_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] *= d_array2[i]; - } - ); - // _raja_res_k3_end - - // _raja_res_memcpy_start - res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); - // _raja_res_memcpy_end - - // _raja_res_k4_start - bool check = true; - RAJA::forall(res_host, RAJA::RangeSegment(0,N), - [&check, h_array] (int i) { - if(h_array[i] != -i) {check = false;} - } - ); - // _raja_res_k4_end - - std::cout << "\n result -- "; - if (check) std::cout << "PASS\n"; - else std::cout << "FAIL\n"; - - res_gpu1.deallocate(d_array1); - res_gpu2.deallocate(d_array2); - res_host.deallocate(h_array); - -} + //
_raja_res_alloc_start + int* d_array1 = res_gpu1.allocate(N); + int* d_array2 = res_gpu2.allocate(N); + int* h_array = res_host.allocate(N); + // _raja_res_alloc_end + + // _raja_res_k1_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) { d_array1[i] = i; }); + // _raja_res_k1_end + + // _raja_res_k2_start + RAJA::resources::Event e = RAJA::forall( + res_gpu2, RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) { d_array2[i] = -1; }); + // _raja_res_k2_end + + // _raja_res_wait_start + res_gpu2.wait_for(&e); + // _raja_res_wait_end + + // _raja_res_k3_start + RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) + { d_array1[i] *= d_array2[i]; }); + // _raja_res_k3_end + + // _raja_res_memcpy_start + res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); + // _raja_res_memcpy_end + + // _raja_res_k4_start + bool check = true; + RAJA::forall(res_host, RAJA::RangeSegment(0, N), + [&check, h_array](int i) + { + if (h_array[i] != -i) + { + check = false; + } + }); + // _raja_res_k4_end + + std::cout << "\n result -- "; + if (check) + std::cout << "PASS\n"; + else + std::cout << "FAIL\n"; + + res_gpu1.deallocate(d_array1); + res_gpu2.deallocate(d_array2); + res_host.deallocate(h_array); + } #endif -// -// -// Clean up. -// + // + // + // Clean up. + // host.deallocate(a); host.deallocate(b); host.deallocate(c); @@ -328,15 +324,22 @@ using EXEC_POLICY = RAJA::sycl_exec; // // Function to check result and report P/F. // -void checkResult(int* res, int len) +void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != i ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != i) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -347,7 +350,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-kernel.cpp b/examples/resource-kernel.cpp index 64ed16b710..32e9f2389e 100644 --- a/examples/resource-kernel.cpp +++ b/examples/resource-kernel.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -19,17 +19,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int N = 10; constexpr int M = 1000000; - RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; - RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + RAJA::resources::Cuda def_cuda_res {RAJA::resources::Cuda::get_default()}; + RAJA::resources::Host def_host_res {RAJA::resources::Host::get_default()}; + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); RAJA::RangeSegment n_range(0, N); using TEST_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< statement::CudaKernelAsync< statement::For<0, cuda_block_x_loop, @@ -40,8 +40,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on 
-// clang-format off + // clang-format on + // clang-format off RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ RAJA::resources::Cuda res_cuda; @@ -61,18 +61,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) + { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; - if (ec_count > 0) + if (ec_count > 0) std::cout << "FAIL : error count = " << ec_count << "\n"; else std::cout << "PASS!\n"; diff --git a/examples/resource-launch.cpp b/examples/resource-launch.cpp index ac161c1e75..3dcb04905d 100644 --- a/examples/resource-launch.cpp +++ b/examples/resource-launch.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -19,10 +19,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int N = 10; constexpr int M = 1000000; - RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; - RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + RAJA::resources::Cuda def_cuda_res {RAJA::resources::Cuda::get_default()}; + RAJA::resources::Host def_host_res {RAJA::resources::Host::get_default()}; + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); @@ -34,7 +34,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using threads_x = RAJA::LoopPolicy; -// clang-format off + // clang-format off RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ @@ -60,15 +60,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) + { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; if (ec_count > 0) diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index 07b4ea51a0..a06f49c1e9 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -30,27 +30,29 @@ */ using host_launch = RAJA::seq_launch_t; -using host_loop = RAJA::seq_exec; +using host_loop = RAJA::seq_exec; #if defined(RAJA_ENABLE_CUDA) using device_launch = RAJA::cuda_launch_t; -using device_loop = RAJA::cuda_global_thread_x; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) using device_launch = RAJA::hip_launch_t; -using device_loop = RAJA::hip_global_thread_x; +using device_loop = RAJA::hip_global_thread_x; #endif using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -60,11 +62,13 @@ using reduce_policy = RAJA::hip_reduce; using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int 
main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW( + "Usage ./teams_reductions host or ./tut_reductions device"); } // @@ -73,75 +77,91 @@ int main(int argc, char *argv[]) // Example usage ./teams_reductions host or ./teams_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW( + "Usage ./teams_reductions host or ./teams_reductions device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; const int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + 
std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); RAJA::resources::Host host_res; @@ -152,15 +172,19 @@ int main(int argc, char *argv[]) RAJA::resources::Hip device_res; #endif - //Get typed erased resource - it will internally store if we are running on the host or device + // Get typed erased resource - it will internally store if we are running on + // the host or device #if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - //How the kernel executes now depends on how the resource is constructed (host or device) -// clang-format off + // How the kernel executes now depends on how the resource is constructed + // (host or device) + // clang-format off RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), @@ -177,21 +201,21 @@ int main(int argc, char *argv[]) }); }); -// clang-format on + // clang-format on std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/tut_daxpy.cpp b/examples/tut_daxpy.cpp index d9f1815261..ea801e7c34 100644 --- a/examples/tut_daxpy.cpp +++ b/examples/tut_daxpy.cpp @@ -15,12 +15,12 @@ * Daxpy Example * * Computes a += b*c, where a, b are vectors of doubles - * and c is a scalar double. It illustrates similarities between a - * C-style for-loop and a RAJA forall loop. + * and c is a scalar double. It illustrates similarities between a + * C-style for-loop and a RAJA forall loop. * * RAJA features shown: * - `forall` loop iteration template method - * - Index range segment + * - Index range segment * - Execution policies */ @@ -28,191 +28,192 @@ // Functions for checking and printing results // void checkResult(double* v1, double* v2, int len); -void printResult(double* v, int len); +void printResult(double* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA daxpy example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 1000000; -// -// Allocate and initialize vector data. -// - double* a0 = new double[N]; + // + // Allocate and initialize vector data. 
+ // + double* a0 = new double[N]; double* aref = new double[N]; double* ta = new double[N]; double* tb = new double[N]; - + double c = 3.14159; - - for (int i = 0; i < N; i++) { + + for (int i = 0; i < N; i++) + { a0[i] = 1.0; tb[i] = 2.0; } -// -// Declare and set pointers to array data. -// We reset them for each daxpy version so that -// they all look the same. -// + // + // Declare and set pointers to array data. + // We reset them for each daxpy version so that + // they all look the same. + // double* a = ta; double* b = tb; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - for (int i = 0; i < N; ++i) { + std::memcpy(a, a0, N * sizeof(double)); + + for (int i = 0; i < N; ++i) + { a[i] += b[i] * c; } - std::memcpy( aref, a, N* sizeof(double) ); + std::memcpy(aref, a, N * sizeof(double)); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the following, we show a RAJA version -// of the daxpy operation and how it can -// be run differently by choosing different -// RAJA execution policies. -// -// Note that the only thing that changes in -// these versions is the execution policy. -// To implement these cases using the -// programming model choices directly, would -// require unique changes for each. -// - -//----------------------------------------------------------------------------// + // + // In the following, we show a RAJA version + // of the daxpy operation and how it can + // be run differently by choosing different + // RAJA execution policies. + // + // Note that the only thing that changes in + // these versions is the execution policy. + // To implement these cases using the + // programming model choices directly, would + // require unique changes for each. + // + + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// RAJA SIMD version. -// + // + // RAJA SIMD version. 
+ // std::cout << "\n Running RAJA SIMD daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// RAJA CUDA parallel GPU version (256 threads per thread block). -// + // + // RAJA CUDA parallel GPU version (256 threads per thread block). + // std::cout << "\n Running RAJA CUDA daxpy...\n"; - a = 0; b = 0; - cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); - cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); - - cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + a = 0; + b = 0; + cudaErrchk(cudaMalloc((void**)&a, N * sizeof(double))); + cudaErrchk(cudaMalloc((void**)&b, N * sizeof(double))); -// clang-format off + cudaErrchk(cudaMemcpy(a, a0, N * sizeof(double), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(b, tb, N * sizeof(double), cudaMemcpyHostToDevice)); + + // clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { a[i] += b[i] * c; }); -// clang-format on - cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + // clang-format on + cudaErrchk(cudaMemcpy(ta, a, N * sizeof(double), cudaMemcpyDeviceToHost)); cudaErrchk(cudaFree(a)); cudaErrchk(cudaFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// RAJA HIP parallel GPU version (256 threads per thread block). -// + // + // RAJA HIP parallel GPU version (256 threads per thread block). 
+ // std::cout << "\n Running RAJA HIP daxpy...\n"; - a = 0; b = 0; - hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); - hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + a = 0; + b = 0; + hipErrchk(hipMalloc((void**)&a, N * sizeof(double))); + hipErrchk(hipMalloc((void**)&b, N * sizeof(double))); - hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(a, a0, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(b, tb, N * sizeof(double), hipMemcpyHostToDevice)); -// clang-format off + // clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { a[i] += b[i] * c; }); -// clang-format on - hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(ta, a, N * sizeof(double), hipMemcpyDeviceToHost)); hipErrchk(hipFree(a)); hipErrchk(hipFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// - delete[] a0; - delete[] aref; - delete[] ta; + // + // Clean up. + // + delete[] a0; + delete[] aref; + delete[] ta; delete[] tb; - + std::cout << "\n DONE!...\n"; return 0; @@ -221,26 +222,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to compare result to reference and report P/F. // -void checkResult(double* v1, double* v2, int len) +void checkResult(double* v1, double* v2, int len) { bool match = true; - for (int i = 0; i < len; i++) { - if ( v1[i] != v2[i] ) { match = false; } + for (int i = 0; i < len; i++) + { + if (v1[i] != v2[i]) + { + match = false; + } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; - } + } } // -// Function to print result. +// Function to print result. 
// -void printResult(double* v, int len) +void printResult(double* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index 7c7eb19832..411175af3d 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -34,16 +34,17 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using forall - CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using workgroup + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when + using forall CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a + CUDA thread block when using workgroup */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +const int CUDA_BLOCK_SIZE = 256; const int CUDA_WORKGROUP_BLOCK_SIZE = 1024; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; +const int HIP_BLOCK_SIZE = 256; const int HIP_WORKGROUP_BLOCK_SIZE = 1024; #endif @@ -69,22 +70,23 @@ void printResult(std::vector const& vars, int var_size, int num_vars); void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, const int halo_width, const int* grid_dims); void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, -// clang-format on - const int halo_width, const int* grid_dims); + // clang-format on + const int halo_width, + const int* grid_dims); void destroy_pack_lists(std::vector& pack_index_lists); // clang-format on void destroy_unpack_lists(std::vector& unpack_index_lists); -template < typename T > +template struct memory_manager_allocator { using value_type = T; memory_manager_allocator() = default; - template < typename U > -// clang-format off + template + // clang-format off constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept { } diff --git a/examples/tut_launch_basic.cpp b/examples/tut_launch_basic.cpp index 9f6b6c63ff..ed6ae39741 100644 --- a/examples/tut_launch_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -31,7 +31,7 @@ * the example below choses a sequential * execution space and either a CUDA or HIP * execution device execution space. -*/ + */ // __host_launch_start using host_launch = RAJA::seq_launch_t; @@ -45,12 +45,12 @@ using device_launch = RAJA::cuda_launch_t; using device_launch = RAJA::hip_launch_t; #endif -using launch_policy = RAJA::LaunchPolicy< - host_launch +using launch_policy = RAJA::LaunchPolicy; + >; /* * RAJA launch exposes a thread/block programming model @@ -64,69 +64,70 @@ using launch_policy = RAJA::LaunchPolicy< * On the host the loops expands to standard C style for loops. 
*/ -using teams_x = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_x = RAJA::LoopPolicy; + >; -using teams_y = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) __global__ void gpuKernel() { - //Equivalent CUDA/HIP style thread/block mapping - // _device_loop_start - {int by = blockIdx.y; - {int bx = blockIdx.x; - - {int ty = threadIdx.y; - {int tx = blockIdx.x; - - printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d block_by %d \n", + // Equivalent CUDA/HIP style thread/block mapping + // _device_loop_start + { + int by = blockIdx.y; + { + int bx = blockIdx.x; + + { + int ty = threadIdx.y; + { + int tx = blockIdx.x; + + printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d " + "block_by %d \n", tx, ty, bx, by); - } } - } } // _device_loop_end @@ -142,40 +143,50 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW( + "Usage ./tut_launch_basic host or ./tut_launch_basic device"); } -// -// Run time policy section is demonstrated in this example by specifying -// kernel exection space as a command line argument (host or device). -// Example usage ./tut_launch_basic host or ./tut_launch_basic device -// + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument (host or device). + // Example usage ./tut_launch_basic host or ./tut_launch_basic device + // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW( + "Usage ./tut_launch_basic host or ./tut_launch_basic device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams on the device \n"); + } -// -// The following three kernels illustrate loop based parallelism -// based on nested for loops. For correctness team and thread loops -// make the assumption that all work inside can be done -// concurrently. -// + // + // The following three kernels illustrate loop based parallelism + // based on nested for loops. For correctness team and thread loops + // make the assumption that all work inside can be done + // concurrently. 
+ // // __compute_grid_start - const int Nteams = 2; + const int Nteams = 2; const int Nthreads = 2; // __compute_grid_end -// clang-format off + // clang-format off RAJA::launch(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), RAJA::Threads(Nthreads,Nthreads)), @@ -201,21 +212,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) }); -// clang-format on - //Equivalent C style loops - if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + // clang-format on + // Equivalent C style loops + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { // _c_style_loops_start - for (int by=0; by>>(); cudaDeviceSynchronize(); #endif #if defined(RAJA_ENABLE_HIP) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) hipLaunchKernelGGL((gpuKernel), dim3(griddim), dim3(blockdim), 0, 0); hipDeviceSynchronize(); #endif diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index 08325513d2..98a2a99427 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -64,9 +64,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -79,7 +81,7 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); // clang-format off template @@ -90,7 +92,7 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); // clang-format off template @@ -98,106 +100,110 @@ void printResult(RAJA::View> Cview, int N); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix -// + // + // Define num rows/cols in matrix + // const int N = 1000; -//const int N = CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE; + // const int N = CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE; -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. 
+ // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::TypedRangeSegment row_range(0, N); RAJA::TypedRangeSegment col_range(0, N); RAJA::TypedRangeSegment dot_range(0, N); // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the next few examples, we show ways that we can use RAJA::forall -// statements for the matrix multiplication kernel. This usage is not -// recommended for performance reasons. Specifically, it limits the amount -// of parallelism that can be exposed to less than is possible. We show -// this usage here, to make this point clear. Later in this file, we -// introduce RAJA nested loop abstractions and show that we can extract all -// available parallelism. -// -// -// In the first RAJA implementation, we replace the outer 'row' loop -// with a RAJA::forall statement. The lambda expression contains the -// inner loops. -// + // + // In the next few examples, we show ways that we can use RAJA::forall + // statements for the matrix multiplication kernel. This usage is not + // recommended for performance reasons. Specifically, it limits the amount + // of parallelism that can be exposed to less than is possible. 
We show + // this usage here, to make this point clear. Later in this file, we + // introduce RAJA nested loop abstractions and show that we can extract all + // available parallelism. + // + // + // In the first RAJA implementation, we replace the outer 'row' loop + // with a RAJA::forall statement. The lambda expression contains the + // inner loops. + // -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential mat-mult (RAJA-row)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_outerforall_start -// clang-format off + // clang-format off RAJA::forall( row_range, [=](int row) { for (int col = 0; col < N; ++col) { @@ -212,31 +218,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_outerforall_end -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we replace the outer 'row' loop and the inner 'col' loop -// with RAJA::forall statements. This will also work with parallel -// execution policies, such as OpenMP and CUDA, with caveats and -// restrictions. -// -// However, nesting RAJA::forall calls like this is not recommended as -// it limits the ability to expose parallelism and flexibility for -// implementation alternatives. -// + // + // Next, we replace the outer 'row' loop and the inner 'col' loop + // with RAJA::forall statements. This will also work with parallel + // execution policies, such as OpenMP and CUDA, with caveats and + // restrictions. + // + // However, nesting RAJA::forall calls like this is not recommended as + // it limits the ability to expose parallelism and flexibility for + // implementation alternatives. + // std::cout << "\n Running sequential mat-mult (RAJA-row, RAJA-col)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_nestedforall_start -// clang-format off + // clang-format off RAJA::forall( row_range, [=](int row) { RAJA::forall( col_range, [=](int col) { @@ -251,47 +257,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_nestedforall_end -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we use a RAJA::kernel method to execute the kernel. These examples, -// illustrate the basic kernel interface and mechanics. The execution policies -// express the outer row and col loops using the RAJA kernel interface. Later, -// in this file we show some more complex policy examples where we express all -// three loops using the kernel interface and use additional kernel features. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) A range and lambda index argument are required for each level in -// the loop nest. Here, we have two of each since we have a doubly-nested -// loop. -// 2) A range for each loop nest level is specified in a RAJA tuple object. 
-// The order of ranges in the tuple must match the order of args to the -// lambda for this to be correct, in general. RAJA provides strongly-typed -// indices to help with this. However, this example does not use them. -// 3) An execution policy is required for each level in the loop nest. These -// are specified in the 'RAJA::statement::For' templates in the -// 'RAJA::KernelPolicy type. -// 4) The loop nest ordering is specified in the nested execution policy -- -// the first 'For' policy is the outermost loop, the second 'For' policy -// is the loop nested inside the outermost loop, and so on. -// 5) The integer values that are the first template arguments to the policies -// indicate which range/lambda argument, the policy applies to. -// + // + // Next, we use a RAJA::kernel method to execute the kernel. These examples, + // illustrate the basic kernel interface and mechanics. The execution policies + // express the outer row and col loops using the RAJA kernel interface. Later, + // in this file we show some more complex policy examples where we express all + // three loops using the kernel interface and use additional kernel features. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) A range and lambda index argument are required for each level in + // the loop nest. Here, we have two of each since we have a doubly-nested + // loop. + // 2) A range for each loop nest level is specified in a RAJA tuple object. + // The order of ranges in the tuple must match the order of args to the + // lambda for this to be correct, in general. RAJA provides strongly-typed + // indices to help with this. However, this example does not use them. + // 3) An execution policy is required for each level in the loop nest. These + // are specified in the 'RAJA::statement::For' templates in the + // 'RAJA::KernelPolicy type. + // 4) The loop nest ordering is specified in the nested execution policy -- + // the first 'For' policy is the outermost loop, the second 'For' policy + // is the loop nested inside the outermost loop, and so on. + // 5) The integer values that are the first template arguments to the policies + // indicate which range/lambda argument, the policy applies to. 
+ // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_basickernel_start using EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -300,8 +306,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -313,22 +319,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _matmult_basickernel_end -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_ompkernel_start using EXEC_POL1 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col @@ -337,9 +343,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_ompkernel_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -351,15 +357,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp inner)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // Swapping the template arguments in this nested policy swaps the loop @@ -370,7 +376,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _matmult_ompkernel_swap_start using EXEC_POL2 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // col RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row @@ -379,9 +385,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_ompkernel_swap_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel( RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -393,15 +399,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy collapses the row and col loops in an OpenMP parallel region. 
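Throughout these hunks, the guarded regions keep clang-format from re-flowing hand-aligned RAJA policy types. A minimal sketch of the pattern, using the sequential kernel policy from this file:

// clang-format off
using EXEC_POL =
  RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,    // row
      RAJA::statement::For<0, RAJA::seq_exec,  // col
        RAJA::statement::Lambda<0>
      >
    >
  >;
// clang-format on

Everything between the two markers is left untouched by clang-format, so the policy nesting and the trailing // row and // col comments survive a reformat pass.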
@@ -409,7 +415,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // outer loop with a 'collapse(2) clause. // using EXEC_POL3 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -417,8 +423,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=](int col, int row) { @@ -431,16 +437,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); // clang-format on -//printResult(Cview, N); -#endif // if RAJA_ENABLE_OPENMP +// printResult(Cview, N); +#endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy replaces the loop nest with a single CUDA kernel launch @@ -454,7 +460,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // using EXEC_POL4 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, @@ -465,8 +471,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -478,15 +484,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); -// clang-format on -//printResult(Cview, N); + // clang-format on + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy collapses the col and row loops into a single CUDA kernel @@ -498,7 +504,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. 
// using EXEC_POL5 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, @@ -513,8 +519,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -526,26 +532,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); checkResult(Cview, N); -// clang-format on -//printResult(Cview, N); + // clang-format on + // printResult(Cview, N); -#endif // if RAJA_ENABLE_CUDA +#endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -562,7 +568,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // using EXEC_POL4 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, @@ -573,8 +579,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -587,17 +593,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This policy collapses the col and row loops into a single HIP kernel @@ -609,7 +615,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. 
// using EXEC_POL5 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -626,8 +632,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE (int col, int row) { @@ -640,23 +646,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); -#endif // if RAJA_ENABLE_HIP +// printResult(Cview, N); +#endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// The following examples use execution policies to express the outer row and -// col loops as well as the inner dot product loop using the RAJA kernel -// interface. They show some more complex policy examples and use additional -// kernel features. -// + // + // The following examples use execution policies to express the outer row and + // col loops as well as the inner dot product loop using the RAJA kernel + // interface. They show some more complex policy examples and use additional + // kernel features. + // - std::cout << "\n Running sequential mat-mult with multiple lambdas (RAJA-POL6a)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas " + "(RAJA-POL6a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy executes the col, row and k (inner dot product) loops @@ -673,7 +680,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _matmult_3lambdakernel_seq_start using EXEC_POL6a = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -686,8 +693,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -710,30 +717,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _matmult_3lambdakernel_seq_end -// clang-format on + // clang-format on checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); -// -// The following examples uses an extension of the lambda statement -// to specify lambda arguments. By specifying arguments within statements -// we remove the requirement that lambdas require all of the tuple contents. -// + // + // The following examples uses an extension of the lambda statement + // to specify lambda arguments. By specifying arguments within statements + // we remove the requirement that lambdas require all of the tuple contents. 
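As a reminder of the statement form being reformatted below, the extended lambda statement names exactly which segments and parameters a given lambda receives. A minimal sketch; the alias name and the Segs/Params lists here are illustrative only:

using RAJA::Segs;
using RAJA::Params;

// Lambda 1 receives segment indices 0 (col) and 1 (row) plus parameter 0 (dot),
// rather than every entry of the iteration-space tuple.
using dot_lambda_stmt = RAJA::statement::Lambda<1, Segs<0, 1>, Params<0>>;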
+ // - std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda args in statements (RAJA-POL6b)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda " + "args in statements (RAJA-POL6b)...\n"; // _matmult_3lambdakernel_args_seq_start // Alias for convenience - using RAJA::Segs; using RAJA::Params; + using RAJA::Segs; using EXEC_POL6b = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -746,8 +754,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -770,22 +778,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _matmult_3lambdakernel_args_seq_end -// clang-format on + // clang-format on checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop collapse (RAJA-POL7)...\n"; + std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop " + "collapse (RAJA-POL7)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_ompcollapse_start using EXEC_POL7 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -797,9 +806,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_ompcollapse_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -822,22 +831,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); -#endif // if RAJA_ENABLE_OPENMP +// printResult(Cview, N); +#endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL8)...\n"; + std::cout + << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cuda_start using EXEC_POL8 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row @@ -852,9 +862,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_cuda_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -877,19 +887,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL9a)...\n"; + std::cout + << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL9a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cudatiled_start using EXEC_POL9a = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -910,9 +921,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_cudatiled_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -935,18 +946,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9b)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args " + "in statements (RAJA-POL9b)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); using EXEC_POL9b = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -967,8 +979,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -991,15 +1003,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running mat-mult with tiling + shared memory...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // This example builds on the RAJA tiling capabilities presented earlier // and uses RAJA LocalArray's to load tiles of the global matrix @@ -1009,16 +1021,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // synchronization. We recommend viewing tut_matrix-transpose-local-array.cpp // for an introduction to RAJA LocalArray types and thread synchronization. 
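For orientation before the shared-memory policy below, the tile-local scratch used in this kernel is a RAJA::LocalArray. A minimal sketch of its declaration, assuming a square CUDA_BLOCK_SIZE tile as elsewhere in this example:

// One CUDA_BLOCK_SIZE x CUDA_BLOCK_SIZE tile with row-major layout (Perm<0, 1>).
// The object carries no storage of its own; memory is attached inside the
// kernel by the policy's shared-memory initialization statement.
using Shmem =
    RAJA::LocalArray<double, RAJA::Perm<0, 1>,
                     RAJA::SizeList<CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE>>;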
- using Shmem = RAJA::LocalArray>; - - using shmem_Lambda0 = RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; - using shmem_Lambda1 = RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>; - using shmem_Lambda2 = RAJA::statement::Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>; - using shmem_Lambda3 = RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; - using shmem_Lambda4 = RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + using Shmem = + RAJA::LocalArray>; + + using shmem_Lambda0 = + RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + using shmem_Lambda1 = + RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, + RAJA::Params<0>>; + using shmem_Lambda2 = + RAJA::statement::Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, + RAJA::Params<1>>; + using shmem_Lambda3 = + RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; + using shmem_Lambda4 = + RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, + RAJA::Params<2>>; using EXEC_POL10 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed//Cuda kernel >; -// clang-format on - Shmem aShared, bShared, cShared; + // clang-format on + Shmem aShared, bShared, cShared; -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -1120,27 +1142,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Cview, N); -//printResult(Cview, N); -#endif // if RAJA_ENABLE_CUDA +// printResult(Cview, N); +#endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. 
matMultKernel<<>>(N, C, A, B); @@ -1148,22 +1171,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -#endif // if RAJA_ENABLE_CUDA +#endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running HIP mat-mult with multiple lambdas (RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hip_start using EXEC_POL8 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, // row @@ -1179,9 +1202,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_hip_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -1204,22 +1227,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); //----------------------------------------------------------------------------// - std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9)...\n"; + std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in " + "statements (RAJA-POL9)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hiptiled_start using EXEC_POL9b = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -1240,9 +1264,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; // _matmult_3lambdakernel_hiptiled_end -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(col_range, row_range, dot_range), @@ -1265,45 +1289,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); -// clang-format on - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // 
Define thread block dimensions dim3 blockdim(HIP_BLOCK_SIZE, HIP_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, + d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); -#endif // if RAJA_ENABLE_HIP +#endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index 80d994ca99..e872590927 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -13,7 +13,7 @@ #include "RAJA/RAJA.hpp" /* - * Time-Domain Finite Difference + * Time-Domain Finite Difference * Acoustic Wave Equation Solver * * ------[Details]---------------------- @@ -26,7 +26,7 @@ * The scheme uses a second order central difference discretization * for time and a fourth order central difference discretization for space. * Periodic boundary conditions are assumed on the grid [-1,1] x [-1, 1]. - * + * * NOTE: The x and y dimensions are discretized identically. 
* ----[RAJA Concepts]------------------- * - RAJA kernels are portable and a single implemenation can run @@ -34,7 +34,7 @@ * * RAJA MaxReduction - RAJA's implementation for computing a maximum value * (MinReduction computes the min) -*/ + */ // // ---[Constant Values]------- @@ -42,7 +42,7 @@ // PI - Value of pi // -const int sr = 2; +const int sr = 2; const double PI = 3.14159265359; // @@ -51,7 +51,8 @@ const double PI = 3.14159265359; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double ox, dx; int nx; }; @@ -66,16 +67,17 @@ struct grid_s { // template -void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx); +void wave(T* P1, T* P2, RAJA::RangeSegment fdBounds, double ct, int nx); double waveSol(double t, double x, double y); -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid); -void computeErr(double *P, double tf, grid_s grid); +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid); +void computeErr(double* P, double tf, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Time-Domain Finite Difference Acoustic Wave Equation Solver"<(entries); - double *P2 = memoryManager::allocate(entries); + double* P1 = memoryManager::allocate(entries); + double* P2 = memoryManager::allocate(entries); // //----[Time stepping parameters]---- @@ -123,22 +125,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Sequential policy -// clang-format off + // clang-format off using fdPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; -// clang-format on + // clang-format on // OpenMP policy -// clang-format off + // clang-format off //using fdPolicy = RAJA::KernelPolicy< //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; -// clang-format on + // clang-format on // CUDA policy - //using fdPolicy = -// clang-format off + // using fdPolicy = + // clang-format off //RAJA::KernelPolicy< // RAJA::statement::CudaKernel< // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, @@ -153,19 +155,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // > // >; -// clang-format on + // clang-format on time = 0; setIC(P1, P2, (time - dt), time, grid); - for (int k = 0; k < nt; ++k) { + for (int k = 0; k < nt; ++k) + { wave(P1, P2, fdBounds, ct, grid.nx); time += dt; - double *Temp = P2; - P2 = P1; - P1 = Temp; + double* Temp = P2; + P2 = P1; + P1 = Temp; } #if defined(RAJA_ENABLE_CUDA) cudaDeviceSynchronize(); @@ -191,31 +194,32 @@ double waveSol(double t, double x, double y) // // Error is computed via ||P_{approx}(:) - P_{analytic}(:)||_{inf} // -void computeErr(double *P, double tf, grid_s grid) +void computeErr(double* P, double tf, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); RAJA::ReduceMax tMax(-1.0); -// clang-format off + // clang-format off using initialPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec , RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; -// clang-format on - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + grid.nx * ty; - double x = grid.ox + tx * grid.dx; - double y = grid.ox + ty * grid.dx; - double myErr = 
std::abs(P[id] - waveSol(tf, x, y)); - - // - // tMax.max() is used to store the maximum value - // - tMax.max(myErr); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) + { + int id = tx + grid.nx * ty; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + double myErr = + std::abs(P[id] - waveSol(tf, x, y)); + + // + // tMax.max() is used to store the maximum value + // + tMax.max(myErr); + }); double lInfErr = tMax; printf("Max Error = %lg, dx = %f \n", lInfErr, grid.dx); @@ -225,29 +229,28 @@ void computeErr(double *P, double tf, grid_s grid) // // Function to set intial condition // -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); -// clang-format off + // clang-format off using initialPolicy = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; - -// clang-format on - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + ty * grid.nx; - double x = grid.ox + tx * grid.dx; - double y = grid.ox + ty * grid.dx; - - P1[id] = waveSol(t0, x, y); - P2[id] = waveSol(t1, x, y); - }); -} + // clang-format on + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) + { + int id = tx + ty * grid.nx; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + + P1[id] = waveSol(t0, x, y); + P2[id] = waveSol(t1, x, y); + }); +} // clang-format off diff --git a/exercises/atomic-histogram.cpp b/exercises/atomic-histogram.cpp index 602a04a10e..55c683ba04 100644 --- a/exercises/atomic-histogram.cpp +++ b/exercises/atomic-histogram.cpp @@ -19,9 +19,9 @@ * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. * * This file contains sequential and OpenMP variants of the histogram @@ -41,11 +41,11 @@ Specifies the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; +// const int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; +// const int HIP_BLOCK_SIZE = 256; #endif // @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. 
// // _array_atomic_histogram_start @@ -70,33 +70,35 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) constexpr int N = 100000; int* array = memoryManager::allocate(N); - int* hist = memoryManager::allocate(M); + int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,50 +106,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); // _range_atomic_histogram_start - //RAJA::TypedRangeSegment array_range(0,N); + // RAJA::TypedRangeSegment array_range(0,N); // _range_atomic_histogram_end /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::atomicAdd operation with RAJA::seq_atomic policy. /// /// You will need to uncomment the range segment definition /// above to use it in the kernel. /// - //RAJA::forall(array_range, [=](int i) { - //}); + // RAJA::forall(array_range, [=](int i) { + // }); checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -159,44 +162,44 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::omp_atomic policy. - /// + /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -216,20 +219,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -242,15 +245,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
/// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -270,20 +273,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -296,9 +299,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -321,12 +324,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -337,7 +347,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp index 1e235dd025..fa6eca0de2 100644 --- a/exercises/atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -19,9 +19,9 @@ * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. 
* * This file contains sequential and OpenMP variants of the histogram @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. // // _array_atomic_histogram_start @@ -70,33 +70,35 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) constexpr int N = 100000; int* array = memoryManager::allocate(N); - int* hist = memoryManager::allocate(M); + int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,45 +106,46 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); - // _range_atomic_histogram_start - RAJA::TypedRangeSegment array_range(0,N); - // _range_atomic_histogram_end + // _range_atomic_histogram_start + RAJA::TypedRangeSegment array_range(0, N); + // _range_atomic_histogram_end -// clang-format off + // clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); -// clang-format on + // clang-format on checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -150,50 +153,50 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajaomp_atomic_histogram_start -// clang-format off + // _rajaomp_atomic_histogram_start + // clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajaomp_atomic_histogram_end -// clang-format on + // clang-format on checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); -// clang-format off + // clang-format off RAJA::forall(array_range, [=](int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); - -// clang-format on + + // clang-format on checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -201,51 +204,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajacuda_atomic_histogram_start -// clang-format off + // _rajacuda_atomic_histogram_start + // clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajacuda_atomic_histogram_end -// clang-format on + // clang-format on checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); - // _rajacuda_atomicauto_histogram_start -// clang-format off + // _rajacuda_atomicauto_histogram_start + // clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajacuda_atomicauto_histogram_end -// clang-format on - + // clang-format on + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -253,45 +256,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajahip_atomic_histogram_start -// clang-format off + // _rajahip_atomic_histogram_start + // clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajahip_atomic_histogram_end -// clang-format on + // clang-format on checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); - // _rajahip_atomicauto_histogram_start -// clang-format off + // _rajahip_atomicauto_histogram_start + // clang-format off RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { RAJA::atomicAdd(&hist[array[i]], 1); }); // _rajahip_atomicauto_histogram_end -// clang-format on - + // clang-format on + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -314,12 +317,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -330,7 +340,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp index 0c6fba9b93..259d95ba36 100644 --- a/exercises/dot-product.cpp +++ b/exercises/dot-product.cpp @@ -14,9 +14,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. * * RAJA features shown: @@ -33,38 +33,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. 
+ // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -73,7 +75,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -83,19 +85,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::seq_exec - /// execution policy type and RAJA::seq_reduce. + /// execution policy type and RAJA::seq_reduce. /// /// NOTE: We've done this one for you to help you get started... /// RAJA::ReduceSum seqdot(0.0); -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); -// clang-format on + // clang-format on dot = seqdot.get(); std::cout << "\t (a, b) = " << dot << std::endl; @@ -103,7 +105,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -113,8 +115,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement the dot product kernel using a RAJA::omp_parallel_for_exec - /// execution policy type and RAJA::omp_reduce reduction policy type. + /// EXERCISE: Implement the dot product kernel using a + /// RAJA::omp_parallel_for_exec + /// execution policy type and RAJA::omp_reduce reduction policy + /// type. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -123,11 +127,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; + // const int CUDA_BLOCK_SIZE = 256; std::cout << "\n Running RAJA CUDA dot product...\n"; @@ -137,10 +141,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::cuda_exec - /// execution policy type and RAJA::cuda_reduce reduction policy type. - /// + /// execution policy type and RAJA::cuda_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' above. - /// if you want to use it here. + /// if you want to use it here. 
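  ///
  /// A minimal sketch of one possible answer (illustration only; it assumes
  /// 'CUDA_BLOCK_SIZE' has been uncommented as the NOTE says). The HIP and
  /// SYCL exercises below follow the same pattern with the corresponding
  /// hip_exec/hip_reduce and sycl_exec/sycl_reduce policies:
  ///
  ///   RAJA::ReduceSum< RAJA::cuda_reduce, double > cudot(0.0);
  ///   RAJA::forall< RAJA::cuda_exec<CUDA_BLOCK_SIZE> >(
  ///       RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
  ///     cudot += a[i] * b[i];
  ///   });
  ///   dot = cudot.get();
  ///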
/// std::cout << "\t (a, b) = " << dot << std::endl; @@ -148,30 +153,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; + // const int HIP_BLOCK_SIZE = 256; std::cout << "\n Running RAJA HIP dot product...\n"; dot = 0.0; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::hip_exec - /// execution policy type and RAJA::hip_reduce reduction policy type. - /// + /// execution policy type and RAJA::hip_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -182,11 +188,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) -//const int SYCL_BLOCK_SIZE = 256; + // const int SYCL_BLOCK_SIZE = 256; std::cout << "\n Running RAJA SYCL dot product...\n"; @@ -196,10 +202,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::sycl_exec - /// execution policy type and RAJA::sycl_reduce. + /// execution policy type and RAJA::sycl_reduce. /// /// NOTE: You will need to uncomment 'SYCL_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -208,7 +214,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -224,10 +230,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/dot-product_solution.cpp b/exercises/dot-product_solution.cpp index c1d340935b..66634c8498 100644 --- a/exercises/dot-product_solution.cpp +++ b/exercises/dot-product_solution.cpp @@ -16,9 +16,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. 
* * RAJA features shown: @@ -35,38 +35,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. + // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -75,7 +77,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -84,12 +86,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); -// clang-format on + // clang-format on dot = seqdot.get(); // _rajaseq_dotprod_end @@ -98,7 +100,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -108,9 +110,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaomp_dotprod_start RAJA::ReduceSum ompdot(0.0); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - ompdot += a[i] * b[i]; - }); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) + { ompdot += a[i] * b[i]; }); dot = ompdot.get(); // _rajaomp_dotprod_end @@ -121,7 +122,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -134,13 +135,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); -// clang-format off + // clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { cudot += a[i] * b[i]; - }); + }); -// clang-format on + // clang-format on dot = cudot.get(); // _rajacuda_dotprod_end @@ -149,7 +150,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if 
defined(RAJA_ENABLE_HIP) @@ -159,22 +160,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) dot = 0.0; - double *d_a = memoryManager::allocate_gpu(N); - double *d_b = memoryManager::allocate_gpu(N); + double* d_a = memoryManager::allocate_gpu(N); + double* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(double), hipMemcpyHostToDevice)); // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); -// clang-format off + // clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { hpdot += d_a[i] * d_b[i]; }); -// clang-format on + // clang-format on dot = hpdot.get(); // _rajahip_dotprod_end @@ -186,7 +187,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) @@ -199,13 +200,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); -// clang-format off + // clang-format off RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { hpdot += a[i] * b[i]; }); -// clang-format on + // clang-format on dot = static_cast(hpdot.get()); // _rajasycl_dotprod_end @@ -215,7 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -231,10 +232,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 301cea4f07..4d61f4d1d3 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -70,7 +70,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -91,8 +91,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -108,8 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -124,8 +126,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops 
to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -136,14 +140,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -155,19 +162,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -190,8 +199,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA::LocalArray, + RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -219,19 +228,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Initialize the local memory statement as position 2 + /// EXERCISE: Initialize the local memory statement as position 2 /// in the paramater list. 
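          ///
          /// HINT (a sketch of the general shape only, assuming the
          /// RAJA::cpu_tile_mem policy and RAJA::ParamList described for the
          /// OpenMP variant below): the requested statement wraps the two
          /// ForICount/Lambda blocks that follow, e.g.
          ///
          ///   RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem,
          ///                                 RAJA::ParamList<2>,
          ///     /* ForICount loops calling Lambda<0>, then Lambda<1> */
          ///   >
          ///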
/// - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, +RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, +RAJA::seq_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, +RAJA::seq_exec, RAJA::statement::ForICount<1, RAJA::statement::Param<0>, +RAJA::seq_exec, RAJA::statement::Lambda<1> > > @@ -242,7 +251,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // clang-format on // clang-format off - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -258,7 +267,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); */ -// clang-format on + // clang-format on // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -286,8 +295,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, +RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, +RAJA::tile_fixed, RAJA::seq_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -302,7 +312,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the first lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the +first lambda. /// // @@ -317,7 +328,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the second lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the +second lambda. 
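          ///
          /// HINT: the sequential policy earlier in this file calls its
          /// second lambda with
          ///
          ///   RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
          ///     RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
          ///       RAJA::statement::Lambda<1> > >
          ///
          /// and this exercise asks for the same shape.
          ///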
/// > > @@ -344,7 +356,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); */ -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -356,7 +368,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_2_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -398,8 +410,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -418,7 +430,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); #endif @@ -430,7 +442,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< // @@ -480,9 +492,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -501,19 +513,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -525,11 +537,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using HIP_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< // @@ -579,9 +592,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -600,22 +613,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); 
// printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _mattranspose_localarray_raja_lambdaargs_start /// @@ -634,7 +649,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, +Offsets<1>, Params<0> > > >, @@ -651,7 +667,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // clang-format on // clang-format off - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -666,13 +682,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); */ -// clang-format on + // clang-format on // _mattranspose_localarray_raja_lambdaargs_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index 60a1271d34..1a7ccb6fc6 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -66,7 +66,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -87,8 +87,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -104,8 +104,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -120,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -132,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. 
// - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -151,19 +158,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -186,8 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA::LocalArray, + RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -201,7 +210,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _mattranspose_localarray_raja_start using SEQ_EXEC_POL_I = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -225,8 +234,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -243,7 +252,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); // _mattranspose_localarray_raja_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -257,7 +266,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_1_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -299,8 +308,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -319,7 +328,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -330,7 +339,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_2_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< // // (0) Execution policies for outer loops @@ -372,8 +381,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -392,7 +401,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// 
clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); #endif @@ -404,7 +413,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< // @@ -454,9 +463,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -475,19 +484,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -499,11 +508,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using HIP_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< // @@ -553,9 +563,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on -// clang-format off + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -574,26 +584,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _raja_mattranspose_lambdaargs_start using SEQ_EXEC_POL_II = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -617,8 +629,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// 
clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -634,11 +646,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_mattranspose_lambdaargs_start -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp index c96daa700e..2d549ada70 100644 --- a/exercises/kernel-matrix-transpose-tiled.cpp +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -21,14 +21,14 @@ * transposed and returned as a second matrix At. * * This operation is carried out using a tiling algorithm. - * The algorithm iterates over tiles of the matrix A and + * The algorithm iterates over tiles of the matrix A and * performs a transpose copy without explicitly storing the tile. * * The algorithm is expressed as a collection of ``outer`` - * and ``inner`` for loops. Iterations of the inner loop will + * and ``inner`` for loops. Iterations of the inner loop will * tranpose tile entries; while outer loops will iterate over * the number of tiles needed to carryout the transpose. - * We do not assume that tiles divide the number of rows and + * We do not assume that tiles divide the number of rows and * and columns of the matrix. * * RAJA features shown: @@ -60,7 +60,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -81,8 +81,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -98,12 +98,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -114,24 +116,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, 
row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -142,12 +148,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -158,7 +164,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start @@ -172,8 +178,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// NOTE: We have done this first one for you. /// - using TILED_KERNEL_EXEC_POL = -// clang-format off + using TILED_KERNEL_EXEC_POL = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -186,11 +192,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -198,7 +203,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -220,7 +226,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -230,7 +237,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -239,8 +247,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
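  // (For reference: the collapsing described above is expressed in RAJA with a
  //  RAJA::statement::Collapse statement, typically
  //  RAJA::statement::Collapse< RAJA::omp_parallel_collapse_exec,
  //  RAJA::ArgList<0, 1>, RAJA::statement::Lambda<0> >, as in the given
  //  policy that follows.)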
// - using TILED_KERNEL_EXEC_POL_OMP2 = -// clang-format off + using TILED_KERNEL_EXEC_POL_OMP2 = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -251,25 +259,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 0 > // closes Tile 1 >; // closes policy list - -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -285,7 +292,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -293,10 +301,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -308,11 +316,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, @@ -327,15 +336,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE(int col, int row) + { d_Atview(col, row) = d_Aview(row, col); }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -350,7 +359,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function 
to check result and report P/F. diff --git a/exercises/kernel-matrix-transpose-tiled_solution.cpp b/exercises/kernel-matrix-transpose-tiled_solution.cpp index 66bf54c505..76e8a9b56d 100644 --- a/exercises/kernel-matrix-transpose-tiled_solution.cpp +++ b/exercises/kernel-matrix-transpose-tiled_solution.cpp @@ -21,14 +21,14 @@ * transposed and returned as a second matrix At. * * This operation is carried out using a tiling algorithm. - * The algorithm iterates over tiles of the matrix A and + * The algorithm iterates over tiles of the matrix A and * performs a transpose copy without explicitly storing the tile. * * The algorithm is expressed as a collection of ``outer`` - * and ``inner`` for loops. Iterations of the inner loop will + * and ``inner`` for loops. Iterations of the inner loop will * tranpose tile entries; while outer loops will iterate over * the number of tiles needed to carryout the transpose. - * We do not assume that tiles divide the number of rows and + * We do not assume that tiles divide the number of rows and * and columns of the matrix. * * RAJA features shown: @@ -60,7 +60,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -81,8 +81,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -98,12 +98,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -114,24 +116,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -142,12 +148,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. 
Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -158,12 +164,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using TILED_KERNEL_EXEC_POL = -// clang-format off + using TILED_KERNEL_EXEC_POL = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -176,11 +182,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -188,7 +193,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -196,8 +202,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - using TILED_KERNEL_EXEC_POL_OMP = -// clang-format off + using TILED_KERNEL_EXEC_POL_OMP = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -208,19 +214,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > > - >; + >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -229,8 +235,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
// - using TILED_KERNEL_EXEC_POL_OMP2 = -// clang-format off + using TILED_KERNEL_EXEC_POL_OMP2 = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, @@ -241,28 +247,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 0 > // closes Tile 1 >; // closes policy list - -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using TILED_KERNEL_EXEC_POL_CUDA = -// clang-format off + using TILED_KERNEL_EXEC_POL_CUDA = + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, @@ -277,18 +282,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE(int col, int row) + { Atview(col, row) = Aview(row, col); }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -300,11 +304,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, @@ -319,15 +324,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE(int col, int row) + { d_Atview(col, row) = d_Aview(row, col); }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif 
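The annotation pattern applied throughout this patch is a "// clang-format off" / "// clang-format on" pair wrapped around hand-indented constructs such as the RAJA::KernelPolicy types above, so that the nested template arguments keep their alignment. A minimal self-contained sketch follows; the tile size of 16 and the int index type are placeholders rather than values taken from the example files, and RAJA is assumed to be on the include path.

#include "RAJA/RAJA.hpp"

#include <cstdio>

// clang-format off
using TILED_SEQ_POL =
  RAJA::KernelPolicy<
    RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::seq_exec,
      RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::seq_exec,
        RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
            RAJA::statement::Lambda<0>
          >
        >
      >
    >
  >;
// clang-format on

int main()
{
  int count   = 0;
  int* pcount = &count;  // the lambda captures this pointer by value

  // Visit every (col, row) pair once; partial tiles are handled by the
  // Tile statements, so no explicit bounds check is needed here.
  RAJA::kernel<TILED_SEQ_POL>(
      RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, 37),   // col range
                       RAJA::TypedRangeSegment<int>(0, 23)),  // row range
      [=](int /*col*/, int /*row*/) { (*pcount) += 1; });

  std::printf("visited %d index pairs (expect %d)\n", count, 37 * 23);
  return 0;
}

Without the off/on pair, clang-format would presumably re-wrap the hand-aligned policy, which appears to be the motivation for adding these annotations before reformatting the rest of each file.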
//----------------------------------------------------------------------------// @@ -342,7 +347,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. diff --git a/exercises/kernel-matrix-transpose.cpp b/exercises/kernel-matrix-transpose.cpp index 32c8664e56..77e7a57c23 100644 --- a/exercises/kernel-matrix-transpose.cpp +++ b/exercises/kernel-matrix-transpose.cpp @@ -48,7 +48,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -64,8 +64,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -81,12 +81,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -94,9 +96,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -108,13 +112,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // -//RAJA::TypedRangeSegment row_Range(0, N_r); -//RAJA::TypedRangeSegment col_Range(0, N_c); + // RAJA::TypedRangeSegment row_Range(0, N_r); + // RAJA::TypedRangeSegment col_Range(0, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running sequential matrix transpose ...\n"; @@ -122,7 +126,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. // // _raja_mattranspose_start @@ -131,9 +135,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. 
/// /// @@ -153,7 +157,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout + << "\n Running openmp matrix transpose - parallel top inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -167,9 +172,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -178,7 +183,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -187,13 +192,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -201,9 +206,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -212,7 +217,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -220,10 +225,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -234,7 +239,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
diff --git a/exercises/kernel-matrix-transpose_solution.cpp b/exercises/kernel-matrix-transpose_solution.cpp index 691c895344..4b5fdd7d49 100644 --- a/exercises/kernel-matrix-transpose_solution.cpp +++ b/exercises/kernel-matrix-transpose_solution.cpp @@ -44,7 +44,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,11 +122,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start - using KERNEL_EXEC_POL = -// clang-format off + using KERNEL_EXEC_POL = + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, @@ -131,18 +135,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { Atview(col, row) = Aview(row, col); }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout + << "\n Running openmp matrix transpose - parallel top inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -150,35 +154,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops sequentially while exposing parallelism on // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = -// clang-format off + using KERNEL_EXEC_POL_OMP = + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > - >; + >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using KERNEL_EXEC_POL_CUDA = -// clang-format off + using KERNEL_EXEC_POL_CUDA = + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_thread_x_loop, @@ -189,18 +192,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) + { Atview(col, row) = Aview(row, col); }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -211,7 +213,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -220,16 +222,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -241,11 +249,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// - constexpr int N = 100; + // _init_define_start + // + // 3D tensor has N^3 entries + // + constexpr int N = 100; constexpr int N_tot = N * N * N; - constexpr double c = 0.0001; - double* a = memoryManager::allocate(N_tot); - double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -135,38 +143,41 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
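Here is a small self-contained sketch of the 3D RAJA::View usage introduced above; N and c are illustrative values, and the stride comment reflects RAJA's default row-major Layout (stated as an assumption, not something the diff itself asserts).

#include "RAJA/RAJA.hpp"

#include <cstdio>
#include <vector>

int main()
{
  constexpr int N    = 4;       // small size for illustration
  constexpr double c = 0.0001;
  std::vector<double> a(N * N * N, 0.0);

  // Wrap the raw pointer; with the default Layout the rightmost index
  // has unit stride, so aView(i, j, k) addresses a[k + N*(j + N*i)].
  RAJA::View<double, RAJA::Layout<3, int>> aView(a.data(), N, N, N);

  for (int k = 0; k < N; ++k)
  {
    for (int j = 0; j < N; ++j)
    {
      for (int i = 0; i < N; ++i)
      {
        aView(i, j, k) = c * i * j * k;
      }
    }
  }

  std::printf("aView(1,2,3) = %g (expect %g)\n", aView(1, 2, 3), c * 1 * 2 * 3);
  return 0;
}

Note that in the tensor-init examples the element-wise comparison against a_ref[i + N*(j + N*k)] passes regardless of the layout convention, because the initialization value c*i*j*k is symmetric in i, j, and k.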
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start using EXEC_POL2 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -177,8 +188,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -189,40 +200,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_outer_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
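The difference between the two OpenMP variants in this file, parallelizing only the outer k loop versus collapsing all three loops, can be seen in a plain C++/OpenMP sketch with no RAJA dependence; N and c below are again illustrative.

#include <cstdio>
#include <vector>

int main()
{
  constexpr int N    = 64;
  constexpr double c = 0.0001;
  std::vector<double> a(N * N * N, 0.0);

  // Variant 1: threads share only the N iterations of the k loop.
#pragma omp parallel for
  for (int k = 0; k < N; ++k)
  {
    for (int j = 0; j < N; ++j)
    {
      for (int i = 0; i < N; ++i)
      {
        a[i + N * (j + N * k)] = c * i * j * k;
      }
    }
  }

  // Variant 2: collapse(3) fuses the perfectly nested loops into a single
  // N*N*N iteration space, exposing more parallelism when N is small
  // relative to the number of threads.
#pragma omp parallel for collapse(3)
  for (int k = 0; k < N; ++k)
  {
    for (int j = 0; j < N; ++j)
    {
      for (int i = 0; i < N; ++i)
      {
        a[i + N * (j + N * k)] = c * i * j * k;
      }
    }
  }

  std::printf("a[last] = %g (expect %g)\n", a[N * N * N - 1],
              c * (N - 1) * (N - 1) * (N - 1));
  return 0;
}

Compiled without OpenMP the pragmas are ignored and the loops run sequentially, so the sketch builds either way.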
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start + // _raja_tensorinit_omp_collapse_start using EXEC_POL3 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -230,8 +244,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -242,11 +256,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; @@ -265,23 +279,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_OPENMP) +#endif // if defined(RAJA_ENABLE_OPENMP) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k @@ -294,8 +308,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -306,11 +320,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -320,16 +334,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end + // _cuda_blockdim_end -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using EXEC_POL6 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -348,8 +362,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format 
off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -360,60 +374,60 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_tiled_direct_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); -// clang-format off + // clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); -// clang-format on + // clang-format on nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_CUDA) +#endif // if defined(RAJA_ENABLE_CUDA) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
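The launch configuration for the tiled-direct CUDA and HIP variants comes down to ceiling division of the problem size by the per-dimension block sizes, which is what RAJA_DIVIDE_CEILING_INT computes in the diff. A host-only sketch of that arithmetic, using the block sizes from the example (the divide_ceiling helper is a stand-in, not a RAJA function):

#include <cassert>
#include <cstdio>

constexpr int divide_ceiling(int n, int block) { return (n + block - 1) / block; }

int main()
{
  constexpr int N          = 100;
  constexpr int block_size = 256;
  constexpr int i_block_sz = 32;
  constexpr int j_block_sz = block_size / i_block_sz;  // 8
  constexpr int k_block_sz = 1;
  static_assert(i_block_sz * j_block_sz * k_block_sz == block_size,
                "Invalid block_size");

  const int nblocks_i = divide_ceiling(N, i_block_sz);  // ceil(100/32) = 4
  const int nblocks_j = divide_ceiling(N, j_block_sz);  // ceil(100/8)  = 13
  const int nblocks_k = divide_ceiling(N, k_block_sz);  // ceil(100/1)  = 100

  // Every index 0..N-1 is covered in each dimension; some threads fall
  // outside N, which is why nested_init keeps its bounds check.
  assert(nblocks_i * i_block_sz >= N);
  assert(nblocks_j * j_block_sz >= N);
  assert(nblocks_k * k_block_sz >= N);

  std::printf("grid = (%d, %d, %d), block = (%d, %d, %d)\n",
              nblocks_i, nblocks_j, nblocks_k,
              i_block_sz, j_block_sz, k_block_sz);
  return 0;
}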
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k @@ -426,8 +440,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -438,12 +452,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -457,11 +471,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using EXEC_POL8 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -480,8 +494,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -492,16 +506,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_tiled_direct_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); -#endif // if defined(RAJA_ENABLE_HIP) +#endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -520,14 +534,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp index e74ec473cb..48033ab2e1 100644 --- a/exercises/kernelintro-execpols_solution.cpp +++ b/exercises/kernelintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,73 +59,80 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// - constexpr int N = 100; + // _init_define_start + // + // 3D tensor has N^3 entries + // + constexpr int N = 100; constexpr int N_tot = N * N * N; - constexpr double c = 0.0001; - double* a = memoryManager::allocate(N_tot); - double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
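One small readability point in the checkResult functions in these files: the tolerance literal 10e-12 equals 1e-11, an order of magnitude looser than the 1e-12 it may suggest at a glance. A standalone sketch of the same comparison pattern with the tolerance spelled out explicitly; the helper name allClose is mine, not RAJA's.

#include <cmath>
#include <cstdio>

bool allClose(const double* a, const double* aref, int n, double tol = 1.0e-11)
{
  for (int i = 0; i < n; ++i)
  {
    if (std::abs(a[i] - aref[i]) >= tol)
    {
      return false;  // stop at the first mismatch, as checkResult does
    }
  }
  return true;
}

int main()
{
  double a[3]    = {1.0, 2.0, 3.0};
  double aref[3] = {1.0, 2.0, 3.0 + 1.0e-13};  // within tolerance
  std::printf("%s\n", allClose(a, aref, 3) ? "result -- PASS" : "result -- FAIL");
  return 0;
}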
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start + // _raja_tensorinit_seq_start using EXEC_POL1 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -135,8 +143,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -147,44 +155,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_seq_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start using EXEC_POL2 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -195,8 +206,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -207,40 +218,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_outer_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start + // _raja_tensorinit_omp_collapse_start using EXEC_POL3 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -248,8 +262,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -260,20 +274,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start + // _raja_tensorinit_omp_collapse_start using EXEC_POL4 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j @@ -283,8 +297,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -295,27 +309,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_omp_collapse_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_OPENMP) +#endif // if defined(RAJA_ENABLE_OPENMP) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k @@ -328,8 +342,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -340,11 +354,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -354,16 +368,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end + // _cuda_blockdim_end -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using EXEC_POL6 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -382,8 +396,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -394,60 +408,60 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_cuda_tiled_direct_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); -// clang-format off + // clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); -// clang-format on + // clang-format on nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_CUDA) +#endif // if defined(RAJA_ENABLE_CUDA) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k @@ -460,8 +474,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -472,12 +486,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -491,11 +505,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using EXEC_POL8 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -514,8 +528,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::kernel( RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), @@ -526,16 +540,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _raja_tensorinit_hip_tiled_direct_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); -#endif // if defined(RAJA_ENABLE_HIP) +#endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... memoryManager::deallocate(a); @@ -554,14 +568,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp index 2d0736ac07..3d98b2517b 100644 --- a/exercises/kernelintro-nested-loop-reorder.cpp +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. * * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. 
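The typed indices referred to here are what turn a mismatched lambda argument order into a compile error rather than a silent bug. A minimal sketch of the pattern follows; RAJA is assumed available, the range bounds are arbitrary, and the explicit template arguments follow the usual RAJA typed-segment usage rather than being copied from the exercise file.

#include "RAJA/RAJA.hpp"

#include <cstdio>

// Strongly typed loop indices; the string is only used in diagnostics.
RAJA_INDEX_VALUE_T(KIDX, int, "KIDX");
RAJA_INDEX_VALUE_T(JIDX, int, "JIDX");
RAJA_INDEX_VALUE_T(IIDX, int, "IIDX");

int main()
{
  RAJA::TypedRangeSegment<IIDX> IRange(0, 2);
  RAJA::TypedRangeSegment<JIDX> JRange(1, 3);
  RAJA::TypedRangeSegment<KIDX> KRange(2, 4);

  // clang-format off
  using KJI_EXECPOL =
    RAJA::KernelPolicy<
      RAJA::statement::For<2, RAJA::seq_exec,      // k (outer)
        RAJA::statement::For<1, RAJA::seq_exec,    // j (middle)
          RAJA::statement::For<0, RAJA::seq_exec,  // i (inner)
            RAJA::statement::Lambda<0>
          >
        >
      >
    >;
  // clang-format on

  // Swapping, say, JIDX and IIDX in the lambda signature would not compile,
  // which is the point of using typed indices.
  RAJA::kernel<KJI_EXECPOL>(
      RAJA::make_tuple(IRange, JRange, KRange),
      [=](IIDX i, JIDX j, KIDX k)
      { std::printf(" (%d, %d, %d)\n", (int)(*i), (int)(*j), (int)(*k)); });

  return 0;
}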
// // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,42 +51,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). + // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout + << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout + << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start -// clang-format off + // clang-format off using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -97,72 +106,87 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) + { printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); // _raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + 
//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout + << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, /// i on middle loop, and k on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout + << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, /// k on middle loop, and j on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + #if 0 // Enable this code block to generate compiler error. 
//----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument @@ -183,4 +207,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/kernelintro-nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp index ff55d11f07..844fc2ed66 100644 --- a/exercises/kernelintro-nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. * * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,42 +51,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). 
+ // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout + << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout + << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start -// clang-format off + // clang-format off using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -97,35 +106,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) + { printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); // _raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout + << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop 
order (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_jik_loops_start -// clang-format off + // clang-format off using JIK_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // j RAJA::statement::For<0, RAJA::seq_exec, // i @@ -136,36 +152,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) + { printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); // _raja_jik_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout + << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_ikj_loops_start -// clang-format off + // clang-format off using IKJ_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // i RAJA::statement::For<2, RAJA::seq_exec, // k @@ -176,17 +199,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + // clang-format on + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) + { printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); // _raja_ikj_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + #if 0 // Enable this code block to generate compiler error. 
//----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument @@ -207,4 +229,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index 582b18ce80..f7723bb617 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -69,7 +69,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -88,8 +88,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -105,8 +105,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -121,8 +123,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -133,14 +137,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -152,19 +159,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,10 +188,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _mattranspose_localarray_raja_start - using loop_pol_1 = RAJA::LoopPolicy; + using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -213,7 +222,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _mattranspose_localarray_raja_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -237,10 +246,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// within the omp parallel region. /// - //using loop_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -272,7 +281,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -285,18 +294,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); /// TODO... 
/// - /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads directly + /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads + /// directly /// const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -329,19 +340,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -353,13 +364,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); using hip_teams_y = RAJA::LoopPolicy; using hip_teams_x = RAJA::LoopPolicy; @@ -367,10 +379,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; - const bool hip_async = false; + const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -403,13 +415,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index bb777cd729..b88107f7e7 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -69,7 +69,7 @@ void printResult(RAJA::View> 
Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -88,8 +88,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -105,8 +105,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -121,8 +123,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -133,14 +137,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -152,19 +159,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,10 +188,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _mattranspose_localarray_raja_start - using loop_pol_1 = RAJA::LoopPolicy; + using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -214,7 +223,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _mattranspose_localarray_raja_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -230,11 +239,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. 
// - using omp_pol_2 = RAJA::LoopPolicy; - using loop_pol_2 = RAJA::LoopPolicy; + using omp_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -266,7 +275,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -279,8 +288,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); using cuda_teams_y = RAJA::LoopPolicy; using cuda_teams_x = RAJA::LoopPolicy; @@ -289,9 +298,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -324,19 +334,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -348,13 +358,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); using hip_teams_y = RAJA::LoopPolicy; using hip_teams_x = RAJA::LoopPolicy; @@ -362,10 +373,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; - const bool hip_async = false; + const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; -// 
clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -398,13 +409,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp index 7f3e4f377b..e5ca57edb7 100644 --- a/exercises/launch-matrix-transpose-tiled.cpp +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -60,7 +60,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -81,8 +81,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -98,12 +98,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -115,24 +117,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -151,13 +157,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. // -/// -/// TODO: Uncomment these range segments so you can use them in the -/// non-HIP exercises in this file. -/* - RAJA::TypedRangeSegment row_Range(0, N_r); - RAJA::TypedRangeSegment col_Range(0, N_c); -*/ + /// + /// TODO: Uncomment these range segments so you can use them in the + /// non-HIP exercises in this file. 
+ /* + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + */ //----------------------------------------------------------------------------// std::cout << "\n Running sequential tiled matrix transpose ...\n"; @@ -169,10 +175,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - //using loop_pol_1 = RAJA::LoopPolicy; + // using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -203,14 +209,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); // _raja_tiled_mattranspose_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -218,17 +225,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - //using omp_for_pol_2 = RAJA::LoopPolicy; - //using loop_pol_2 = RAJA::LoopPolicy; + // using omp_for_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; /// /// TODO... /// /// EXERCISE: Create a launch_policy_2 that will create an omp parallel region /// - /// Uncomment the kernel below to run it and check the result. - /// - /// + /// Uncomment the kernel below to run it and check the result. + /// + /// /* // clang-format off @@ -236,9 +243,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] +(RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] +(RAJA::TypedRangeSegment const &col_tile) { RAJA::loop(ctx, row_tile, [&] (int row) { RAJA::loop(ctx, col_tile, [&] (int col) { @@ -253,14 +262,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); */ -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -285,41 +294,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Implement the cuda launch policy to dispatch the kernel below /// on the GPU /// - /// When you uncomment kernel code below, you will also need to + /// When you uncomment kernel code below, you will also need to /// uncomment variables above that are used within it. 
/// -/* -// clang-format off - RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + /* + // clang-format off + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { - Atview(col, row) = Aview(row, col); + Atview(col, row) = Aview(row, col); + }); }); - }); + }); }); - }); - }); -*/ -// clang-format on + }); + */ + // clang-format on checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -334,13 +345,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); using hip_teams_y = RAJA::LoopPolicy; using hip_teams_x = RAJA::LoopPolicy; @@ -348,10 +360,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; - const bool hip_async = false; + const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -374,10 +386,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// diff --git a/exercises/launch-matrix-transpose-tiled_solution.cpp b/exercises/launch-matrix-transpose-tiled_solution.cpp index fb296721a6..8a070469bf 100644 --- a/exercises/launch-matrix-transpose-tiled_solution.cpp +++ 
b/exercises/launch-matrix-transpose-tiled_solution.cpp @@ -60,7 +60,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -81,8 +81,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -98,12 +98,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -115,30 +117,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { int col = bx * TILE_DIM + tx; // Matrix column index int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// // @@ -163,10 +169,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using loop_pol_1 = RAJA::LoopPolicy; + using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -187,14 +193,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tiled_mattranspose_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -202,11 +209,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. 
// - using omp_for_pol_2 = RAJA::LoopPolicy; - using loop_pol_2 = RAJA::LoopPolicy; + using omp_for_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -228,13 +235,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -243,9 +250,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + // _raja_mattranspose_cuda_start using cuda_teams_y = RAJA::LoopPolicy; using cuda_teams_x = RAJA::LoopPolicy; @@ -254,9 +261,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -279,13 +287,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -297,13 +305,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); using hip_teams_y = RAJA::LoopPolicy; using hip_teams_x = RAJA::LoopPolicy; @@ -311,10 +320,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; - const bool hip_async = false; + const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; -// 
clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), RAJA::Threads(c_block_sz, r_block_sz)), @@ -337,10 +346,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// diff --git a/exercises/launch-matrix-transpose.cpp b/exercises/launch-matrix-transpose.cpp index cf504fe0ff..57b3995bda 100644 --- a/exercises/launch-matrix-transpose.cpp +++ b/exercises/launch-matrix-transpose.cpp @@ -48,7 +48,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -64,8 +64,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -81,12 +81,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -94,9 +96,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -108,10 +112,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -122,13 +126,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start - using loop_policy_seq = RAJA::LoopPolicy; + using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -146,57 +150,57 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout + << "\n Running openmp matrix transpose - parallel top inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); // // This policy loops sequentially while exposing parallelism on // one of the inner loops. - - //uncomment to use in example below - //using loop_policy_omp = RAJA::LoopPolicy; - using launch_policy_omp = RAJA::LaunchPolicy; - - RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - - /// TODO... - /// - /// EXERCISE: Implement the loops to apply omp parallism and sequential - /// execution on the column and row loops respectively - /// - - //Atview(col, row) = Aview(row, col); + // uncomment to use in example below + // using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; - }); + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) + { + /// TODO... + /// + /// EXERCISE: Implement the loops to apply omp parallism and sequential + /// execution on the column and row loops respectively + /// + + // Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -211,13 +215,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -228,7 +232,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
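The hunks above all follow one pattern: each RAJA::launch call site is fenced with // clang-format off and // clang-format on so the hand-indented lambda nest, which mirrors the loop structure, keeps its layout, while the surrounding C-style code is reformatted normally. The sketch below condenses the sequential transpose from the exercise above into a standalone function to show the guarded pattern on its own. It is illustrative, not a verbatim quote of the patch: the view, range, and alias names follow the exercise, and the sequential policy arguments (RAJA::seq_exec, RAJA::seq_launch_t) are the usual choices assumed here.

#include "RAJA/RAJA.hpp"

// Minimal sketch: sequential matrix transpose with RAJA::launch,
// with clang-format guards protecting the hand-indented lambda nest.
void transpose_seq(RAJA::View<int, RAJA::Layout<2>> Aview,
                   RAJA::View<int, RAJA::Layout<2>> Atview,
                   int N_r, int N_c)
{
  RAJA::TypedRangeSegment<int> row_Range(0, N_r);
  RAJA::TypedRangeSegment<int> col_Range(0, N_c);

  using loop_policy_seq   = RAJA::LoopPolicy<RAJA::seq_exec>;
  using launch_policy_seq = RAJA::LaunchPolicy<RAJA::seq_launch_t>;

  // clang-format off
  RAJA::launch<launch_policy_seq>(
      RAJA::LaunchParams(),  // params may be empty when running on the host
      [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

        RAJA::loop<loop_policy_seq>(ctx, row_Range, [&] (int row) {
          RAJA::loop<loop_policy_seq>(ctx, col_Range, [&] (int col) {

            Atview(col, row) = Aview(row, col);  // transpose assignment

          });
        });

      });
  // clang-format on
}

Everything between the guards keeps its manual layout; everything outside them is owned by clang-format, which is the split the annotations in these diffs establish.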
diff --git a/exercises/launch-matrix-transpose_solution.cpp b/exercises/launch-matrix-transpose_solution.cpp index 05ef616500..778e489e9b 100644 --- a/exercises/launch-matrix-transpose_solution.cpp +++ b/exercises/launch-matrix-transpose_solution.cpp @@ -48,7 +48,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -64,8 +64,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -81,12 +81,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -94,9 +96,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -108,10 +112,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -122,13 +126,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start - using loop_policy_seq = RAJA::LoopPolicy; + using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -143,13 +147,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout + << "\n Running openmp matrix transpose - parallel top inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -157,10 +162,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops sequentially while exposing parallelism on // one of the inner loops. // - using loop_policy_omp = RAJA::LoopPolicy; + using loop_policy_omp = RAJA::LoopPolicy; using launch_policy_omp = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -175,25 +180,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -208,13 +213,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_mattranspose_cuda_end -// clang-format on + // clang-format on checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -225,7 +230,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
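In the CUDA section of the solution just above, only the policy aliases and launch parameters change; the guarded launch body is identical to the sequential version. The sketch below shows that substitution in isolation. The loop and launch policies named here (RAJA::cuda_thread_x_loop, RAJA::cuda_thread_y_loop, RAJA::cuda_launch_t) are conventional choices assumed for illustration, and the views are assumed to reference memory the device can access, as the exercises arrange via memoryManager.

#include "RAJA/RAJA.hpp"

#if defined(RAJA_ENABLE_CUDA)
// Minimal sketch: the same transpose body retargeted to CUDA by swapping
// the policy aliases; rows map to threadIdx.y, columns to threadIdx.x
// of a single 16x16 team, matching the launch parameters in the exercise.
void transpose_cuda(RAJA::View<int, RAJA::Layout<2>> Aview,
                    RAJA::View<int, RAJA::Layout<2>> Atview,
                    int N_r, int N_c)
{
  RAJA::TypedRangeSegment<int> row_Range(0, N_r);
  RAJA::TypedRangeSegment<int> col_Range(0, N_c);

  using cuda_thread_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
  using cuda_thread_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>;

  const bool async = false;  // synchronous launch, as in the exercise
  using launch_policy_cuda = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;

  // clang-format off
  RAJA::launch<launch_policy_cuda>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16, 16)),
      [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

        RAJA::loop<cuda_thread_y>(ctx, row_Range, [&] (int row) {
          RAJA::loop<cuda_thread_x>(ctx, col_Range, [&] (int col) {

            Atview(col, row) = Aview(row, col);

          });
        });

      });
  // clang-format on
}
#endif

Because the body is unchanged across back ends, the guards protect the same region in every variant, which is why the same annotations recur at each launch site in the hunks above.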
diff --git a/exercises/launchintro-execpols.cpp b/exercises/launchintro-execpols.cpp index e5dc7b4f70..b711e1cce1 100644 --- a/exercises/launchintro-execpols.cpp +++ b/exercises/launchintro-execpols.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// - constexpr int N = 100; + // _init_define_start + // + // 3D tensor has N^3 entries + // + constexpr int N = 100; constexpr int N_tot = N * N * N; - constexpr double c = 0.0001; - double* a = memoryManager::allocate(N_tot); - double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. 
+ //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -129,11 +137,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// the tensor initialization kernel. /// -// _raja_tensorinit_seq_start - //using loop_policy_1 = RAJA::LoopPolicy; + // _raja_tensorinit_seq_start + // using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -146,35 +154,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ }); // _raja_tensorinit_seq_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; @@ -188,14 +199,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// kernel that creates a parallel outer loop. 
/// -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start /* using omp_policy_2 = RAJA::LoopPolicy; using loop_policy_2 = RAJA::LoopPolicy; */ using launch_policy_2 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { @@ -213,46 +224,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_omp_outer_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start - using cuda_teams_z_3 = RAJA::LoopPolicy; + // _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; - const bool async_3 = false; + const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -269,19 +280,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); -// clang-format on -// _raja_tensorinit_cuda_end + // clang-format on + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -289,10 +300,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_y_4 = RAJA::LoopPolicy; using cuda_threads_x_4 = RAJA::LoopPolicy; - const bool async_4 = false; + const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -320,37 +331,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_cuda_tiled_direct_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); -// clang-format off + // clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); -// clang-format on + // clang-format on nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_CUDA) +#endif // if defined(RAJA_ENABLE_CUDA) #if defined(RAJA_ENABLE_HIP) @@ -363,35 +374,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start - using hip_teams_z_5 = RAJA::LoopPolicy; + // _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; - const bool async_5 = false; + const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -409,20 +420,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_hip_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -430,10 +441,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y_6 = RAJA::LoopPolicy; using hip_threads_x_6 = RAJA::LoopPolicy; - const bool async_6 = false; + const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -461,16 +472,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_hip_tiled_direct_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); -#endif // if defined(RAJA_ENABLE_HIP) +#endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -489,14 +500,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp index 2439988b06..1a7b52eee0 100644 --- a/exercises/launchintro-execpols_solution.cpp +++ b/exercises/launchintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,75 +59,82 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// - constexpr int N = 100; + // _init_define_start + // + // 3D tensor has N^3 entries + // + constexpr int N = 100; constexpr int N_tot = N * N * N; - constexpr double c = 0.0001; - double* a = memoryManager::allocate(N_tot); - double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start - using loop_policy_1 = RAJA::LoopPolicy; + // _raja_tensorinit_seq_start + using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -142,47 +150,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_seq_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start - using omp_policy_2 = RAJA::LoopPolicy; - using loop_policy_2 = RAJA::LoopPolicy; + // _raja_tensorinit_omp_outer_start + using omp_policy_2 = RAJA::LoopPolicy; + using loop_policy_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -198,46 +209,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_omp_outer_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start - using cuda_teams_z_3 = RAJA::LoopPolicy; + // _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; - const bool async_3 = false; + const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -254,19 +265,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); -// clang-format on -// _raja_tensorinit_cuda_end + // clang-format on + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -274,10 +285,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_y_4 = RAJA::LoopPolicy; using cuda_threads_x_4 = RAJA::LoopPolicy; - const bool async_4 = false; + const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -305,37 +316,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_cuda_tiled_direct_end -// clang-format on + // clang-format on checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); -// clang-format off + // clang-format off dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); -// clang-format on + // clang-format on nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); -#endif // if defined(RAJA_ENABLE_CUDA) +#endif // if defined(RAJA_ENABLE_CUDA) #if defined(RAJA_ENABLE_HIP) @@ -348,35 +359,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start - using hip_teams_z_5 = RAJA::LoopPolicy; + // _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; - const bool async_5 = false; + const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -394,20 +405,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // _raja_tensorinit_hip_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -415,10 +426,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_threads_y_6 = RAJA::LoopPolicy; using hip_threads_x_6 = RAJA::LoopPolicy; - const bool async_6 = false; + const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; -// clang-format off + // clang-format off RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), @@ -446,16 +457,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); // _raja_tensorinit_hip_tiled_direct_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); -#endif // if defined(RAJA_ENABLE_HIP) +#endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -474,14 +485,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp index 960142a83b..41accd3651 100644 --- a/exercises/memoryManager.hpp +++ b/exercises/memoryManager.hpp @@ -28,21 +28,20 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif -// clang-format off template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); #else ptr = new T[size]; #endif @@ -50,9 +49,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) @@ -66,36 +66,38 @@ void deallocate(T *&ptr) } } -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #elif defined(RAJA_ENABLE_SYCL) sycl_res->deallocate(ptr); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif }; // namespace memoryManager diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp index 058308de74..1864aa28de 100644 --- a/exercises/offset-layout-stencil.cpp +++ b/exercises/offset-layout-stencil.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. 
- * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,12 +81,12 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::Layout * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -111,103 +111,105 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// - int* input = memoryManager::allocate(totCells * sizeof(int)); - int* output = memoryManager::allocate(totCells * sizeof(int)); + // + // Allocate and initialize lattice + // + int* input = memoryManager::allocate(totCells * sizeof(int)); + int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); std::memset(input, 0, totCells * sizeof(int)); std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { - int id = col + totCellsInCol * row; + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { + int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { - - int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { + + int id = col + totCellsInCol * row; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. 
-// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. + // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; -// clang-format off + // clang-format off RAJA::OffsetLayout layout = RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); -// clang-format on + // clang-format on RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; @@ -215,34 +217,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col RAJA::statement::Lambda<0> > > - >; + >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - + [=](int col, int row) + { outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -260,12 +261,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// earlier tutorial section. 
/// - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -275,7 +276,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row @@ -284,28 +285,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > > > - >; + >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE(int col, int row) { - + [=] RAJA_DEVICE(int col, int row) + { outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -315,14 +315,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start using NESTED_EXEC_POL4 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -333,22 +334,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE(int col, int row) { - + [=] RAJA_DEVICE(int col, int row) + { d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(output, d_output, totCells * sizeof(int), + hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -356,11 +358,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -375,8 +377,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -394,14 +398,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/offset-layout-stencil_solution.cpp b/exercises/offset-layout-stencil_solution.cpp index 79b056bafd..a22dbdf0f7 100644 --- a/exercises/offset-layout-stencil_solution.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. - * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,13 +81,13 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::OffsetLayout * - RAJA::make_offset_layout method * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -112,103 +112,105 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// - int* input = memoryManager::allocate(totCells * sizeof(int)); - int* output = memoryManager::allocate(totCells * sizeof(int)); + // + // Allocate and initialize lattice + // + int* input = memoryManager::allocate(totCells * sizeof(int)); + int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); std::memset(input, 0, totCells * sizeof(int)); std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { - int id = col + totCellsInCol * row; + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { + int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { - - int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { + + int id = col + totCellsInCol * row; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. 
-// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. + // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; -// clang-format off + // clang-format off RAJA::OffsetLayout layout = RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); -// clang-format on + // clang-format on RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; @@ -216,34 +218,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // row RAJA::statement::For<0, RAJA::seq_exec, // col RAJA::statement::Lambda<0> > > - >; + >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - + [=](int col, int row) + { outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -252,8 +253,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaomp_start - using NESTED_EXEC_POL2 = -// clang-format off + using NESTED_EXEC_POL2 = + // clang-format off RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col @@ -261,26 +262,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - + [=](int col, int row) + { outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajaomp_end - std::cout << "\noutput lattice:\n"; + 
std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -290,7 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row @@ -299,28 +299,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > > > - >; + >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE(int col, int row) { - + [=] RAJA_DEVICE(int col, int row) + { outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -332,15 +331,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_input = memoryManager::allocate_gpu(totCells); int* d_output = memoryManager::allocate_gpu(totCells); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_output, output, totCells * sizeof(int), + hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start using NESTED_EXEC_POL4 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -351,22 +352,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE(int col, int row) { - + [=] RAJA_DEVICE(int col, int row) + { d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(output, d_output, totCells * sizeof(int), + hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -374,11 +376,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** 
RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -393,8 +395,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -412,14 +416,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp index 2ba290d27d..dcdbec6bdf 100644 --- a/exercises/permuted-layout-batch-matrix-multiply.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -75,77 +75,77 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// - auto timer = RAJA::Timer(); + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // + auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). 
-// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. + // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). + // // _permutedlayout_defviews_start std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // /// /// TODO... @@ -158,13 +158,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Then, create views for the A2, B2, C2 arrays using the /// layout object; i.e., Aview2, Bview2, and Cview2. /// - /// Hint: You will the same indexing to access the array data - /// via the Views as for the Views above which are created + /// Hint: You will the same indexing to access the array data + /// via the Views as for the Views above which are created /// using the layout1 View (see kernels in the code below). /// - /// When you are done with the Views, test them out by + /// When you are done with the Views, test them out by /// uncommenting the kernels in the code below that use the - /// the Aview2, Bview2, and Cview2 views. + /// the Aview2, Bview2, and Cview2 views. /// // @@ -179,7 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using INIT_POL = RAJA::seq_exec; #endif -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { for (int row = 0; row < N_r; ++row) { for (int col = 0; col < N_c; ++col) { @@ -194,19 +194,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } }); -// clang-format on + // clang-format on -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... 
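// ---------------------------------------------------------------------------
// Sketch (illustration only): the two layouts discussed above differ only in
// which index has unit stride. Writing the index arithmetic out directly,
// using the same formulas as the macros quoted in the comments (values are
// arbitrary and the function name is hypothetical):
#include <cstdio>

void permuted_layout_offsets_sketch()
{
  constexpr long long N = 8000000, N_r = 3, N_c = 3;
  long long e = 5, r = 1, c = 2;

  // Layout 1: the column index (argument 2) has unit stride, so one matrix's
  // entries are contiguous:  A[c + N_c*(r + N_r*e)]
  long long idx1 = c + N_c * (r + N_r * e);

  // Layout 2: the element index (argument 0) has unit stride, so the same
  // (row, col) entry of consecutive matrices is contiguous:
  //   A2[e + N*(c + N_c*r)]
  long long idx2 = e + N * (c + N_c * r);

  std::printf("layout1 offset = %lld, layout2 offset = %lld\n", idx1, idx2);
}
// ---------------------------------------------------------------------------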
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -242,77 +243,77 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_loop_end -// clang-format on + // clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... " << std::endl; -/* - timer.start(); - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - // _permutedlayout2_batchedmatmult_loop_start -// clang-format off - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); + /* + timer.start(); + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + // _permutedlayout2_batchedmatmult_loop_start + // clang-format off + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = 
Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); - // _permutedlayout2_batchedmatmult_loop_end -// clang-format on - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + } + ); + // _permutedlayout2_batchedmatmult_loop_end + // clang-format on + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -322,11 +323,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_omp_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -363,80 +365,81 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_omp_end -// clang-format on + // clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; std::memset(C2, 0, N_c * N_r * N * sizeof(double)); -/* - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); -// clang-format off - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { + /* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // clang-format off + RAJA::forall(RAJA::TypedRangeSegment(0, + N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); - timer.stop(); -// clang-format on + } + ); + timer.stop(); + // clang-format on - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -446,10 +449,11 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -486,17 +490,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... " << std::endl; @@ -509,7 +513,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); // clang-format off - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, +N), [=] RAJA_DEVICE(int e) { Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) @@ -556,29 +561,32 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... " << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); RAJA::View> d_Cview(d_C, layout1); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -615,26 +623,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... " << std::endl; @@ -648,15 +657,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), +hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * +sizeof(double), hipMemcpyHostToDevice )); minRun = std::numeric_limits::max(); for (int i = 0; i < NITER; ++i) { timer.start(); // clang-format off - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, +N), [=] RAJA_DEVICE(int e) { d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) @@ -699,7 +710,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), +hipMemcpyDeviceToHost )); std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); @@ -713,11 +725,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp index 9d2f707907..b02178482f 100644 --- a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -76,81 +76,80 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// - auto timer = RAJA::Timer(); + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // + auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). -// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. + // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). 
+ // // _permutedlayout_defviews_start std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); - -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); + + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // // _permutedlayout_permviews_start std::array perm2 {{1, 2, 0}}; - auto layout2 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); + auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2); RAJA::View> Aview2(A2, layout2); RAJA::View> Bview2(B2, layout2); @@ -169,7 +168,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using INIT_POL = RAJA::seq_exec; #endif -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { for (int row = 0; row < N_r; ++row) { for (int col = 0; col < N_c; ++col) { @@ -184,19 +183,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } }); -// clang-format on + // clang-format on -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... " << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -232,28 +232,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_loop_end -// clang-format on + // clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... 
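// ---------------------------------------------------------------------------
// Sketch (illustration only): reading the permutation perm2 {{1, 2, 0}} with
// the "slowest to fastest stride" convention stated earlier, index 1 (row)
// varies slowest, index 2 (column) next, and index 0 (matrix) fastest. Over
// extents {N, N_r, N_c} that implies the strides in the quoted macro
// A2[e + N*(c + N_c*r)]: stride(e) = 1, stride(c) = N, stride(r) = N*N_c.
// A quick arithmetic check (hypothetical function name):
#include <cassert>

void perm2_stride_sketch()
{
  constexpr long long N = 8000000, N_c = 3;
  long long e = 7, r = 2, c = 1;
  long long idx = e + N * (c + N_c * r);
  assert(idx == e * 1 + c * N + r * (N * N_c));
}
// ---------------------------------------------------------------------------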
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout2_batchedmatmult_loop_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -290,17 +291,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout2_batchedmatmult_loop_end -// clang-format on + // clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -310,11 +311,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_omp_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -351,18 +353,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); // _permutedlayout_batchedmatmult_omp_end -// clang-format on + // clang-format on timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; @@ -370,10 +372,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { @@ -410,19 +413,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -432,10 +435,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -472,17 +476,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... " << std::endl; @@ -490,10 +494,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -530,30 +535,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... 
" << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); @@ -563,16 +568,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_A2, A2, N_c * N_r * N * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B2, B2, N_c * N_r * N * sizeof(double), + hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -609,28 +619,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... 
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { @@ -667,16 +679,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } ); timer.stop(); -// clang-format on + // clang-format on RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C2, d_C2, N_c * N_r * N * sizeof(double), + hipMemcpyDeviceToHost)); - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); memoryManager::deallocate_gpu(d_A); @@ -687,11 +700,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_C2); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); diff --git a/exercises/reductions.cpp b/exercises/reductions.cpp index 4c6b90c063..0073297555 100644 --- a/exercises/reductions.cpp +++ b/exercises/reductions.cpp @@ -32,7 +32,7 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) @@ -45,54 +45,58 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; constexpr int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start -//RAJA::TypedRangeSegment arange(0, N); - // _reductions_range_end + // RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; @@ -101,7 +105,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Define EXEC_POL1 and REDCUE_POL1 for executing sequentially. /// - + /// TODO... /// /// EXERCISE: Remove comments for remainder of sequential section. @@ -112,11 +116,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc + seq_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc + seq_maxloc(std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - + seq_sum += a[i]; seq_min.min(a[i]); @@ -130,14 +136,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " << seq_maxloc.getLoc() << std::endl; */ - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; @@ -152,7 +158,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. 
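// ---------------------------------------------------------------------------
// Sketch (illustration only): a plain sequential loop confirming the expected
// values stated in the note above (sum 0, min -100 at N/2, max 100 at N/2+1)
// for this data initialization; it is independent of the RAJA reduction
// kernels and the function name is hypothetical.
#include <cassert>

void reduction_reference_sketch()
{
  constexpr int N = 1000000;
  static int a[N];
  for (int i = 0; i < N; ++i) { a[i] = (i % 2 == 0) ? 1 : -1; }
  a[N / 2]     = -100;
  a[N / 2 + 1] =  100;

  long long sum = 0;
  int minval = a[0], maxval = a[0], minloc = 0, maxloc = 0;
  for (int i = 0; i < N; ++i)
  {
    sum += a[i];
    if (a[i] < minval) { minval = a[i]; minloc = i; }
    if (a[i] > maxval) { maxval = a[i]; maxloc = i; }
  }
  assert(sum == 0 && minval == -100 && maxval == 100);
  assert(minloc == N / 2 && maxloc == N / 2 + 1);
}
// ---------------------------------------------------------------------------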
+ /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -181,12 +188,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmin, loc = " << omp_minloc.get() << " , " << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -200,7 +207,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -232,7 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -240,7 +248,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment arange1(0, N); int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start using EXEC_POL3 = RAJA::hip_exec; @@ -250,40 +258,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange1, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); - hip_sum += d_a[i]; + RAJA::forall(arange1, + [=] RAJA_DEVICE(int i) + { + hip_sum += d_a[i]; - hip_min.min(d_a[i]); - hip_max.max(d_a[i]); + hip_min.min(d_a[i]); + hip_max.max(d_a[i]); - hip_minloc.minloc(d_a[i], i); - hip_maxloc.maxloc(d_a[i], i); - - }); + hip_minloc.minloc(d_a[i], i); + hip_maxloc.maxloc(d_a[i], i); + }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/reductions_solution.cpp b/exercises/reductions_solution.cpp index 6da731e62e..eb753caf0a 100644 --- a/exercises/reductions_solution.cpp +++ b/exercises/reductions_solution.cpp @@ -45,90 +45,96 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; - a[minloc_ref] = -100; + a[minloc_ref] = -100; constexpr int maxloc_ref = N / 2 + 1; - a[maxloc_ref] = 100; + a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - + RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc seq_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc seq_maxloc( + std::numeric_limits::min(), -1); - RAJA::forall(arange, [=](int i) { - - seq_sum += a[i]; + RAJA::forall(arange, + [=](int i) + { + seq_sum += a[i]; - seq_min.min(a[i]); - seq_max.max(a[i]); + seq_min.min(a[i]); + seq_max.max(a[i]); - seq_minloc.minloc(a[i], i); - seq_maxloc.maxloc(a[i], i); - - }); + seq_minloc.minloc(a[i], i); + seq_maxloc.maxloc(a[i], i); + }); std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " - << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " - << seq_maxloc.getLoc() << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + << seq_minloc.getLoc() << 
std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; @@ -141,32 +147,34 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum omp_sum(0); RAJA::ReduceMin omp_min(std::numeric_limits::max()); RAJA::ReduceMax omp_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc omp_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc omp_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc omp_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc omp_maxloc( + std::numeric_limits::min(), -1); - RAJA::forall(arange, [=](int i) { + RAJA::forall(arange, + [=](int i) + { + omp_sum += a[i]; - omp_sum += a[i]; + omp_min.min(a[i]); + omp_max.max(a[i]); - omp_min.min(a[i]); - omp_max.max(a[i]); - - omp_minloc.minloc(a[i], i); - omp_maxloc.maxloc(a[i], i); - - }); + omp_minloc.minloc(a[i], i); + omp_maxloc.maxloc(a[i], i); + }); std::cout << "\tsum = " << omp_sum.get() << std::endl; std::cout << "\tmin = " << omp_min.get() << std::endl; std::cout << "\tmax = " << omp_max.get() << std::endl; std::cout << "\tmin, loc = " << omp_minloc.get() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -179,37 +187,39 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum cuda_sum(0); RAJA::ReduceMin cuda_min(std::numeric_limits::max()); RAJA::ReduceMax cuda_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc cuda_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc cuda_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc cuda_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc cuda_maxloc( + std::numeric_limits::min(), -1); - cuda_sum += a[i]; + RAJA::forall(arange, + [=] RAJA_DEVICE(int i) + { + cuda_sum += a[i]; - cuda_min.min(a[i]); - cuda_max.max(a[i]); + cuda_min.min(a[i]); + cuda_max.max(a[i]); - cuda_minloc.minloc(a[i], i); - cuda_maxloc.maxloc(a[i], i); - - }); + cuda_minloc.minloc(a[i], i); + cuda_maxloc.maxloc(a[i], i); + }); std::cout << "\tsum = " << cuda_sum.get() << std::endl; std::cout << "\tmin = " << cuda_min.get() << std::endl; std::cout << "\tmax = " << cuda_max.get() << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.get() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.get() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * 
sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start using EXEC_POL3 = RAJA::hip_exec; @@ -219,40 +229,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { - - hip_sum += d_a[i]; + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); - hip_min.min(d_a[i]); - hip_max.max(d_a[i]); + RAJA::forall(arange, + [=] RAJA_DEVICE(int i) + { + hip_sum += d_a[i]; - hip_minloc.minloc(d_a[i], i); - hip_maxloc.maxloc(d_a[i], i); + hip_min.min(d_a[i]); + hip_max.max(d_a[i]); - }); + hip_minloc.minloc(d_a[i], i); + hip_maxloc.maxloc(d_a[i], i); + }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/scan.cpp b/exercises/scan.cpp index c0c8ef7182..6106eb0115 100644 --- a/exercises/scan.cpp +++ b/exercises/scan.cpp @@ -42,11 +42,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 16; +// constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -68,15 +68,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// - int* in = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); std::iota(in, in + N, -1); @@ -87,11 +87,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -99,7 +98,7 @@ int main(int 
RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type. + /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// @@ -113,7 +112,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; @@ -123,14 +122,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; @@ -140,14 +139,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; @@ -157,14 +156,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit minimum operator. + /// execution policy type and an explicit minimum operator. /// CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; @@ -174,7 +173,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit maximum operator. + /// execution policy type and an explicit maximum operator. /// CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) @@ -184,24 +183,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; /// /// TODO... 
/// - /// EXERCISE: Implement an inclusive RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an inclusive RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. /// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; @@ -210,8 +210,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an exclusive inplace RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -220,13 +221,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA scans... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -246,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; @@ -266,7 +267,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; @@ -288,57 +289,57 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; std::copy_n(in, N, out); - int* d_in = memoryManager::allocate_gpu(N); + int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -349,11 +350,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); diff --git a/exercises/scan_solution.cpp b/exercises/scan_solution.cpp index 7338260c1e..c1df5e9e33 100644 --- a/exercises/scan_solution.cpp +++ b/exercises/scan_solution.cpp @@ -42,11 +42,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) - constexpr int CUDA_BLOCK_SIZE = 16; +constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) - constexpr int HIP_BLOCK_SIZE = 16; +constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -68,15 +68,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// - int* in = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); std::iota(in, in + N, -1); @@ -87,11 +87,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -104,71 +103,71 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; std::copy_n(in, N, out); // _scan_inclusive_seq_plus_start -// clang-format off + // clang-format off RAJA::inclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_seq_plus_end -// clang-format on + // clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; std::copy_n(in, N, out); // _scan_exclusive_seq_plus_start -// clang-format off + // clang-format off RAJA::exclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_seq_plus_end -// clang-format on + // clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; // _scan_inclusive_inplace_seq_min_start std::copy_n(in, N, out); -// clang-format off + // clang-format off RAJA::inclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::minimum{}); // _scan_inclusive_inplace_seq_min_end -// 
clang-format on + // clang-format on CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; std::copy_n(in, N, out); // _scan_exclusive_inplace_seq_max_start -// clang-format off + // clang-format off RAJA::exclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::maximum{}); // _scan_exclusive_inplace_seq_max_end -// clang-format on + // clang-format on CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) printArray(out, N); @@ -177,37 +176,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; // _scan_inclusive_omp_plus_start -// clang-format off + // clang-format off RAJA::inclusive_scan(RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_omp_plus_end -// clang-format on + // clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; std::copy_n(in, N, out); // _scan_exclusive_inplace_omp_plus_start -// clang-format off + // clang-format off RAJA::exclusive_scan_inplace( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end -// clang-format on + // clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -215,62 +214,62 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a few CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a few CUDA scans... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; std::copy_n(in, N, out); // _scan_inclusive_inplace_cuda_plus_start -// clang-format off + // clang-format off RAJA::inclusive_scan_inplace>( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end -// clang-format on + // clang-format on CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; std::copy_n(in, N, out); // _scan_exclusive_inplace_cuda_plus_start -// clang-format off + // clang-format off RAJA::exclusive_scan_inplace>( RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_cuda_plus_end -// clang-format on + // clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; std::copy_n(in, N, out); // _scan_exclusive_cuda_plus_start -// clang-format off + // clang-format off RAJA::exclusive_scan>( RAJA::make_span(in, N), RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_cuda_plus_end -// clang-format on + // clang-format on CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -278,52 +277,52 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; std::copy_n(in, N, out); - int* d_in = memoryManager::allocate_gpu(N); + int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _scan_inclusive_inplace_hip_plus_start -// clang-format off + // clang-format off RAJA::inclusive_scan_inplace>( RAJA::make_span(d_out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_hip_plus_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); -// clang-format off + // clang-format off RAJA::exclusive_scan>( RAJA::make_span(d_in, N), RAJA::make_span(d_out, N), RAJA::operators::plus{}); -// clang-format on - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -334,11 +333,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(in); memoryManager::deallocate(out); diff --git a/exercises/segment-indexset-basics.cpp b/exercises/segment-indexset-basics.cpp index b7c0c26458..ff71242315 100644 --- a/exercises/segment-indexset-basics.cpp +++ b/exercises/segment-indexset-basics.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. 
* * RAJA features shown: @@ -39,63 +39,62 @@ // (so example code is less verbose) //----------------------------------------------------------------------------// // _raja_segment_type_start -using IdxType = int; -using RangeSegType = RAJA::TypedRangeSegment; +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; -using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. - camp::resources::Resource host_res{camp::resources::Host()}; + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + camp::resources::Resource host_res {camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; // _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,47 +103,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start IdxType iis = static_cast(idx.size()); // to avoid compiler warning - for (IdxType ii = 0; ii < iis; ++ii) { - 
std::cout << idx[ ii ] << " "; + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; @@ -156,9 +157,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,43 +167,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; @@ -214,50 +214,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + 
//----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. + // // _raja_seq_indexset_policy_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; // _raja_seq_indexset_policy__end std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; @@ -265,20 +265,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Make a RAJA version of a kernel that prints the sequence - /// + /// /// 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 /// - /// using a RAJA::TypedIndexSet containing two - /// RAJA::TypedRangeSegment objects and on - /// RAJA::TypedListSegment object. + /// using a RAJA::TypedIndexSet containing two + /// RAJA::TypedRangeSegment objects and on + /// RAJA::TypedListSegment object. /// std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/segment-indexset-basics_solution.cpp b/exercises/segment-indexset-basics_solution.cpp index 4267582d98..148ca676d2 100644 --- a/exercises/segment-indexset-basics_solution.cpp +++ b/exercises/segment-indexset-basics_solution.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. 
* * RAJA features shown: @@ -39,63 +39,62 @@ // (so example code is less verbose) //----------------------------------------------------------------------------// // _raja_segment_type_start -using IdxType = int; -using RangeSegType = RAJA::TypedRangeSegment; +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; -using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. - camp::resources::Resource host_res{camp::resources::Host()}; + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + camp::resources::Resource host_res {camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; -// _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + // _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } -// _cstyle_range1_end + // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,61 +103,62 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start IdxType iis = static_cast(idx.size()); // to avoid compiler 
warning - for (IdxType ii = 0; ii < iis; ++ii) { - std::cout << idx[ ii ] << " "; + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; // _raja_negstriderange1_start - RAJA::forall(RangeStrideSegType(19, -1, -1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(19, -1, -1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstriderange1_end std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,121 +166,117 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; // _raja_range3_start - RAJA::forall(RangeStrideSegType(0, 20, 3), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 3), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range3_end std::cout << std::endl; 
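// The index-set hunks that follow combine several segment types into one
// iteration space and traverse it with a segment-iteration execution policy.
// A minimal, self-contained sketch of that pattern, assuming the typed
// segment/index-set API used throughout these exercises (TypedRangeSegment,
// TypedListSegment, TypedIndexSet) and a host camp resource:
#include "RAJA/RAJA.hpp"
#include <iostream>
#include <vector>

void index_set_sketch()
{
  using IdxType      = int;
  using RangeSegType = RAJA::TypedRangeSegment<IdxType>;
  using ListSegType  = RAJA::TypedListSegment<IdxType>;
  using IndexSetType = RAJA::TypedIndexSet<RangeSegType, ListSegType>;

  // List segment indices live in host (CPU) memory.
  camp::resources::Resource host_res {camp::resources::Host()};

  std::vector<IdxType> idx {10, 11, 14, 20, 22};
  ListSegType list_seg(idx, host_res);

  // Iteration space: 0..7, then the list indices, then 24..27.
  IndexSetType iset;
  iset.push_back(RangeSegType(0, 8));
  iset.push_back(list_seg);
  iset.push_back(RangeSegType(24, 28));

  // Outer policy iterates over the segments sequentially; inner policy
  // executes each segment with seq_exec.
  using ISET_EXECPOL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>;
  RAJA::forall<ISET_EXECPOL>(iset, [=](IdxType i) { std::cout << i << " "; });
  std::cout << std::endl;
}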
-//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + //----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. + // std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; // _raja_indexset_3segs_start IndexSetType is3; - is3.push_back( RangeSegType(0, 8) ); + is3.push_back(RangeSegType(0, 8)); - IdxType indx[ ] = {10, 11, 14, 20, 22}; - ListSegType list2( indx, 5, host_res ); - is3.push_back( list2 ); + IdxType indx[] = {10, 11, 14, 20, 22}; + ListSegType list2(indx, 5, host_res); + is3.push_back(list2); - is3.push_back( RangeSegType(24, 28) ); - - RAJA::forall(is3, [=] (IdxType i) { - std::cout << i << " "; - }); + is3.push_back(RangeSegType(24, 28)); + + RAJA::forall(is3, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_3segs_end std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/sort.cpp b/exercises/sort.cpp index 9244ed199f..7cc1c605e5 100644 --- a/exercises/sort.cpp +++ b/exercises/sort.cpp @@ -53,7 +53,7 @@ // clang-format on #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -62,14 +62,20 @@ template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -83,27 
+89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// - int* in = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); - unsigned* in_vals = memoryManager::allocate(N); + unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937 {12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937 {67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -114,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -125,7 +131,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec - /// execution policy type. + /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// @@ -136,12 +142,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; @@ -151,15 +157,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; @@ -169,15 +175,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... 
/// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; @@ -187,15 +193,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -206,15 +212,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -225,10 +232,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -236,9 +244,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; @@ -248,15 +256,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. 
/// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -266,24 +274,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit greater operation. + /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec + /// execution + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -294,18 +304,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; @@ -315,77 +326,80 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
/// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; std::copy_n(in, N, out); std::copy_n(in_vals, N, out_vals); - int* d_out = memoryManager::allocate_gpu(N); + int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
/// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -396,11 +410,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(in); memoryManager::deallocate(out); diff --git a/exercises/sort_solution.cpp b/exercises/sort_solution.cpp index a82dd5d4a5..61faf6cc84 100644 --- a/exercises/sort_solution.cpp +++ b/exercises/sort_solution.cpp @@ -62,14 +62,20 @@ constexpr int HIP_BLOCK_SIZE = 16; template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -83,27 +89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// - int* in = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); - unsigned* in_vals = memoryManager::allocate(N); + unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937 {12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937 {67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -114,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -127,66 +133,66 @@ int main(int RAJA_UNUSED_ARG(argc), 
char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_seq_less_start -// clang-format off + // clang-format off RAJA::sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_seq_less_end -// clang-format on + // clang-format on - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_stable_seq_less_start -// clang-format off + // clang-format off RAJA::stable_sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_seq_less_end -// clang-format on + // clang-format on - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); // _sort_stable_seq_greater_start -// clang-format off + // clang-format off RAJA::stable_sort(RAJA::make_span(out, N), RAJA::operators::greater{}); // _sort_stable_seq_greater_end -// clang-format on + // clang-format on - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -194,19 +200,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_seq_less_start -// clang-format off + // clang-format off RAJA::sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::less{}); // _sort_pairs_seq_less_end -// clang-format on + // clang-format on - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -214,14 +221,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_seq_greater_start -// clang-format off + // clang-format off RAJA::stable_sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end -// 
clang-format on + // clang-format on - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -229,27 +237,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_omp_less_start -// clang-format off + // clang-format off RAJA::sort(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_omp_less_end -// clang-format on + // clang-format on - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -257,27 +265,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_omp_greater_start -// clang-format off + // clang-format off RAJA::stable_sort_pairs(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end -// clang-format on + // clang-format on - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -285,90 +294,94 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_cuda_greater_start -// clang-format off + // clang-format off RAJA::sort_pairs>(RAJA::make_span(out, N), RAJA::make_span(out_vals, N), RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end -// clang-format on + // clang-format on - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_stable_cuda_less_start -// clang-format off + // clang-format off RAJA::stable_sort>(RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_cuda_less_end -// clang-format on + // clang-format on - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; std::copy_n(in, N, out); std::copy_n(in_vals, N, out_vals); - int* d_out = memoryManager::allocate_gpu(N); + int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); -// clang-format off + // clang-format off RAJA::sort_pairs>(RAJA::make_span(d_out, N), RAJA::make_span(d_out_vals, N), RAJA::operators::less{}); -// clang-format on - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + // clang-format on + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _sort_stable_hip_greater_start -// clang-format off + // clang-format off RAJA::stable_sort>( RAJA::make_span(d_out, N), RAJA::operators::greater{}); // _sort_stable_hip_greater_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -379,11 +392,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(in); memoryManager::deallocate(out); diff --git a/exercises/tutorial_halfday/ex2_approx-pi.cpp b/exercises/tutorial_halfday/ex2_approx-pi.cpp index c1ccc05aee..3d45baad95 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
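Numerically, the exercise evaluates this integral with a midpoint Riemann sum: split [0, 1] into N subintervals of width dx = 1/N, sample the integrand at each midpoint, and multiply the accumulated sum by 4. A minimal sketch of that sum in RAJA form, using the sequential execution and reduction policies (the variants in the file below follow the same pattern with different policies):

  const int    N  = 512 * 512;
  const double dx = 1.0 / double(N);

  RAJA::ReduceSum<RAJA::seq_reduce, double> pi_sum(0.0);

  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N), [=](int i) {
    double x = (double(i) + 0.5) * dx;    // midpoint of subinterval i
    pi_sum += dx / (1.0 + x * x);         // 1/(1 + x^2) * dx
  });

  double pi_approx = 4.0 * pi_sum.get();  // ~3.141592...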
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -46,38 +46,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// - const int N = 512 * 512; - const double dx = 1.0 / double(N); + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // + const int N = 512 * 512; + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; @@ -85,7 +85,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::seq_reduce policy type /// to accumulate the sum. /// @@ -95,21 +95,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); - }); - double seq_pi_val = seq_pi.get() * 4.0; + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) + { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); + }); + double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -117,22 +118,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -142,23 +143,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::ReduceSum object with RAJA::omp_reduce policy type /// to accumulate the sum. - /// + /// double omp_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -168,16 +168,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::cuda_exec execution policy type and a + /// method with RAJA::cuda_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::cuda_reduce policy type /// to accumulate the sum. - /// + /// double cuda_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp index 5654ffbea2..bcfcbd0025 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
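The same sum moves to the GPU by swapping in CUDA execution and reduction policies and marking the lambda for device compilation; a sketch, assuming a CUDA_BLOCK_SIZE thread-block constant like the one used elsewhere in this file:

  RAJA::ReduceSum<RAJA::cuda_reduce, double> cuda_pi(0.0);

  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) {
      double x = (double(i) + 0.5) * dx;
      cuda_pi += dx / (1.0 + x * x);
    });

  double cuda_pi_val = 4.0 * cuda_pi.get();

Only the policies and the RAJA_DEVICE annotation change; the loop body is identical to the sequential version, which is the point of the exercise.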
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -43,59 +43,60 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// - const int N = 512 * 512; - const double dx = 1.0 / double(N); + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // + const int N = 512 * 512; + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); - }); + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) + { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); + }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -103,22 +104,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -127,23 +128,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; - RAJA::ReduceSum< REDUCE_POL2, double > omp_pi(0.0); + RAJA::ReduceSum omp_pi(0.0); - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); - }); + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) + { + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); + }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -152,16 +154,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; - RAJA::ReduceSum< REDUCE_POL3, double > cuda_pi(0.0); + RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); - }); + RAJA::forall(RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) + { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); + }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex5_line-of-sight.cpp b/exercises/tutorial_halfday/ex5_line-of-sight.cpp index c94e8a4132..a2cb65ab36 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight.cpp @@ -24,30 +24,30 @@ * * Given an observation point X on a terrain map, and a set of points * {Y0, Y1, Y2, ...} along a ray starting at X, find which points on the - * terrain at Y0, Y1, etc. 
are visible from the point at X. A point is - * visible from the point at X if and only if there is no other point on the - * terrain that blocks its view from the point at X. More precisely, - * a point on the terrain at Y is visible from the point at X if and only if - * no other point on the terrain between X and Y has a greater vertical angle + * terrain at Y0, Y1, etc. are visible from the point at X. A point is + * visible from the point at X if and only if there is no other point on the + * terrain that blocks its view from the point at X. More precisely, + * a point on the terrain at Y is visible from the point at X if and only if + * no other point on the terrain between X and Y has a greater vertical angle * from the point at X than the point at Y. So although a point at Y may - * be at a higher altitude than all other points on the terrain between Y + * be at a higher altitude than all other points on the terrain between Y * and X, the point at Y may not be visible from the point at X. * - * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' - * such that dist[i] is the horizontal distance between X and Yi, and a - * vector 'alt' such that alt[i] is the altitude at point Yi. To solve - * the line of sight problem, we compute an angle vector 'ang', where + * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' + * such that dist[i] is the horizontal distance between X and Yi, and a + * vector 'alt' such that alt[i] is the altitude at point Yi. To solve + * the line of sight problem, we compute an angle vector 'ang', where * ang[i] = arctan( (alt[i] - altX)/(dist[i]). Next, we perform a "max" - * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point + * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point * at Yi is visible from the point at X if ang[i] >= ang_max[i]. Otherwise, * the point at Yi is not visible. * * This file contains a C-style sequential implementation of the solution to - * the line-of-sight problem. Where indicated by comments, you will fill in + * the line-of-sight problem. Where indicated by comments, you will fill in * sequential and OpenMP versions of the algorithm using a RAJA scan operation * to compute the 'ang_max' vector and a RAJA forall method to determine which - * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA - * compiler, fill in the RAJA CUDA version of the algorithm also. + * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA + * compiler, fill in the RAJA CUDA version of the algorithm also. * * RAJA features you will use: * - inclusive scan operations with 'max' operator @@ -86,62 +86,69 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize distance and altitude arrays. // - int N = 100; + int N = 100; double alt_max = 100.0; - double* dist = memoryManager::allocate(N); - double* alt = memoryManager::allocate(N); - double* ang = memoryManager::allocate(N); - double* ang_max = memoryManager::allocate(N); - int* visible = memoryManager::allocate(N); + double* dist = memoryManager::allocate(N); + double* alt = memoryManager::allocate(N); + double* ang = memoryManager::allocate(N); + double* ang_max = memoryManager::allocate(N); + int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? 
i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array - // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + // + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -153,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// First, use a 'max' RAJA::inclusive_scan on the angle vector /// with RAJA::seq_exec execution policy. Then, use a RAJA::forall /// template with the same execution policy to determine which /// points are visible. @@ -162,12 +169,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -181,23 +188,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... 
/// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::omp_parallel_for_exec execution policy. Then, use - /// a RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::omp_parallel_for_exec execution policy. Then, use + /// a RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -211,16 +218,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::cuda_exec execution policy. Then, use a - /// RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::cuda_exec execution policy. Then, use a + /// RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -248,13 +255,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } diff --git a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp index d5d242da86..1309fa252e 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp @@ -83,62 +83,69 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize distance and altitude arrays. // - int N = 100; + int N = 100; double alt_max = 100.0; - double* dist = memoryManager::allocate(N); - double* alt = memoryManager::allocate(N); - double* ang = memoryManager::allocate(N); - double* ang_max = memoryManager::allocate(N); - int* visible = memoryManager::allocate(N); + double* dist = memoryManager::allocate(N); + double* alt = memoryManager::allocate(N); + double* ang = memoryManager::allocate(N); + double* ang_max = memoryManager::allocate(N); + int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? 
i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -148,27 +155,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL1 = RAJA::seq_exec; - RAJA::inclusive_scan< EXEC_POL1 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum {}); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; - } - }); + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) + { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; + } + }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -180,28 +192,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** 
RAJA_UNUSED_ARG(argv[])) using EXEC_POL2 = RAJA::omp_parallel_for_exec; - RAJA::inclusive_scan< EXEC_POL2 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); - - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; - } - }); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum {}); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) + { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; + } + }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -213,21 +230,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL3 = RAJA::cuda_exec; - RAJA::inclusive_scan< EXEC_POL3 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); - - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; - } - }); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum {}); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) + { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; + } + }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -255,13 +277,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp index 2362292b30..f56454975f 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. 
Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One (Part a) has column indexing as stride-1 with the outer - * loop traversing the rows ('i' loop variable) and the inner loop traversing - * the columns ('j' loop variable). The other (Part B) has row indexing as - * stride-1 and reverses the order of the loops. This shows that a C-style - * implementation requires two different implementations, one for each loop - * order, since the array offset arithmetic is different in the two cases. - * Where indicated by comments, you will fill in versions using - * two-dimensional RAJA Views with offset layouts. One loop ordering requires - * permutations, while the other does not. If done properly, you will see - * that both RAJA versions have identical inner loop bodies, which is not the + * This file contains two C-style sequential implementations of stencil + * computation. One (Part a) has column indexing as stride-1 with the outer + * loop traversing the rows ('i' loop variable) and the inner loop traversing + * the columns ('j' loop variable). The other (Part B) has row indexing as + * stride-1 and reverses the order of the loops. 
This shows that a C-style + * implementation requires two different implementations, one for each loop + * order, since the array offset arithmetic is different in the two cases. + * Where indicated by comments, you will fill in versions using + * two-dimensional RAJA Views with offset layouts. One loop ordering requires + * permutations, while the other does not. If done properly, you will see + * that both RAJA versions have identical inner loop bodies, which is not the * case for the C-style variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). // enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// - const int Nr_int = 5; + // + // Define number of rows and columns of cells in the 2D mesh. + // + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// - int* B = memoryManager::allocate(tot_cells * sizeof(int)); - int* A = memoryManager::allocate(int_cells * sizeof(int)); + // + // Allocate and initialize input array + // + int* B = memoryManager::allocate(tot_cells * sizeof(int)); + int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// Part A: -// -// Variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part A: + // + // Variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. 
+ // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; - B[idx] = 1; + B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; - int idx_in = (j + 1) + Nr_tot * (i + 1); + int idx_in = (j + 1) + Nr_tot * (i + 1); A_ref[idx_out] = B[idx_in] + // C B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E B[idx_in - 1] + B[idx_in + 1]; // S, N - } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,114 +206,120 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE (Part A): + /// EXERCISE (Part A): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops - // iterate over the interior (i, j) indices. + /// A array. The B array access requires an offset since the loops + // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the column (j-loop) indexing + /// For this part (A) of the exercise, the column (j-loop) indexing /// has stride 1. /// - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Part B: -// -// Variant of stencil computation with row indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part B: + // + // Variant of stencil computation with row indexing as stride-1. 
+ //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; - B[idx] = 1; + B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; - int idx_in = (i + 1) + Nc_tot * (j + 1); + int idx_in = (i + 1) + Nc_tot * (j + 1); A_ref[idx_out] = B[idx_in] + // C B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N B[idx_in - 1] + B[idx_in + 1]; // W, E - } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout + << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); /// /// TODO... /// - /// EXERCISE (Part B): + /// EXERCISE (Part B): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops + /// A array. The B array access requires an offset since the loops // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the row (i-loop) indexing - /// has stride 1. Thus, layouts for the A and B arrays require + /// For this part (A) of the exercise, the row (i-loop) indexing + /// has stride 1. Thus, layouts for the A and B arrays require /// the same permutation. 
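The payoff of Part B is that only the layout construction changes; with layouts permuted so that the i (row) index is stride-1, the inner statement can be written exactly as in Part A:

  Aview(i, j) = Bview(i, j) +
                Bview(i - 1, j) + Bview(i + 1, j) +
                Bview(i, j - 1) + Bview(i, j + 1);

A hedged sketch of a permuted layout for the A array follows; the permutation utilities and their ordering conventions should be checked against the Layout/View documentation for your RAJA version, and B needs the analogous permuted offset layout so the halo shift still applies:

  // Extents listed for (i, j); the permutation requests that index 0 (i)
  // be the stride-1 index, giving linear index i + Nc_int * j.
  auto permA = RAJA::make_permuted_layout(
      {{Nc_int, Nr_int}}, RAJA::as_array<RAJA::Perm<1, 0>>::get());

  RAJA::View<int, RAJA::Layout<2>> Aview(A, permA);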
/// - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -321,7 +330,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is +// For array printing, 'stride1dim' indicates which mesh dimenstride is // clang-format off // stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp index 1efd533eee..1f20b3feeb 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. 
So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One has column indexing as stride-1 with the outer loop - * traversing the rows ('i' loop variable) and the inner loop traversing the + * This file contains two C-style sequential implementations of stencil + * computation. One has column indexing as stride-1 with the outer loop + * traversing the rows ('i' loop variable) and the inner loop traversing the * columns ('j' loop variable). The other has row indexing as stride-1 and - * reverses the order of the loops. This shows that a C-style implementation + * reverses the order of the loops. This shows that a C-style implementation * requires two different implementations, one for each loop order, since the - * array offset arithmetic is different in the two cases. Where indicated + * array offset arithmetic is different in the two cases. Where indicated * by comments, you will fill in versions using two-dimensional RAJA Views * with offset layouts. One loop ordering requires permutations, while the * other does not. If done properly, you will see that both RAJA versions * have identical inner loop bodies, which is not the case for the C-style * variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). // enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// + // + // Define number of rows and columns of cells in the 2D mesh. 
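// A small illustration (not part of the exercise sources) of the offset
// indexing sketched in the diagram above: with an offset layout whose lower
// bounds are {-1, -1}, the view is indexed directly with halo coordinates, so
// Bview(-1, -1) refers to B[0] (the cell the diagram labels (-1, -1)) and
// Bview(0, 0) to the first interior cell, with no "+1" shifts in the loop
// body. Nc_tot and Nr_tot are the padded extents defined just below; the <2>
// rank and the View element type are reconstructed, since the patch text
// drops text between angle brackets.

RAJA::OffsetLayout<2> B_layout =
    RAJA::make_offset_layout<2>({{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}});
RAJA::View<int, RAJA::OffsetLayout<2>> Bview(B, B_layout);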
+ // const int DIM = 2; - const int Nr_int = 5; + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// - int* B = memoryManager::allocate(tot_cells * sizeof(int)); - int* A = memoryManager::allocate(int_cells * sizeof(int)); + // + // Allocate and initialize input array + // + int* B = memoryManager::allocate(tot_cells * sizeof(int)); + int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// First variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // First variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. + // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; - B[idx] = 1; + B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; - int idx_in = (j + 1) + Nr_tot * (i + 1); + int idx_in = (j + 1) + Nr_tot * (i + 1); A_ref[idx_out] = B[idx_in] + // C B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E B[idx_in - 1] + B[idx_in + 1]; // S, N - } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,80 +206,85 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Create offset Layout and Views for data access. 
Note that only // the input array access requires an offset since the loops iterate over - // the interior (i, j) indices. We can use the default layout for the - // output array. Also, since the 'j' index (rightmost) is stride-1, + // the interior (i, j) indices. We can use the default layout for the + // output array. Also, since the 'j' index (rightmost) is stride-1, // we don't need a permutation for this case. // -// clang-format off + // clang-format off RAJA::OffsetLayout B_layout = RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}); -// clang-format on + // clang-format on RAJA::View> Bview(B, B_layout); RAJA::View> Aview(A, Nc_int, Nr_int); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { - - Aview(i, j) = Bview(i, j) + // C - Bview(i - 1, j) + Bview(i + 1, j) + // W, E - Bview(i, j - 1) + Bview(i, j + 1); // S, N + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { + Aview(i, j) = Bview(i, j) + // C + Bview(i - 1, j) + Bview(i + 1, j) + // W, E + Bview(i, j - 1) + Bview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Second variant of stencil computation with row indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Second variant of stencil computation with row indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; - B[idx] = 1; + B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; - int idx_in = (i + 1) + Nc_tot * (j + 1); + int idx_in = (i + 1) + Nc_tot * (j + 1); A_ref[idx_out] = B[idx_in] + // C B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N B[idx_in - 1] + B[idx_in + 1]; // W, E - } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout + << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); @@ -291,35 +299,35 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // application. // - std::array perm {{1, 0}}; // 'i' index (position zero0) - // is stride-1 + std::array perm {{1, 0}}; // 'i' index (position zero0) + // is stride-1 - RAJA::OffsetLayout pB_layout = - RAJA::make_permuted_offset_layout( {{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}, - perm ); + RAJA::OffsetLayout pB_layout = RAJA::make_permuted_offset_layout( + {{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}}, perm); - RAJA::Layout pA_layout = - RAJA::make_permuted_layout( {{Nc_int, Nr_int}}, perm ); + RAJA::Layout pA_layout = + RAJA::make_permuted_layout({{Nc_int, Nr_int}}, perm); RAJA::View> pBview(B, pB_layout); RAJA::View> pAview(A, pA_layout); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { - - pAview(i, j) = pBview(i, j) + // C - pBview(i - 1, j) + pBview(i + 1, j) + // W, E - pBview(i, j - 1) + pBview(i, j + 1); // S, N + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { + pAview(i, j) = pBview(i, j) + // C + pBview(i - 1, j) + pBview(i + 1, j) + // W, E + pBview(i, j - 1) + pBview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -330,7 +338,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is +// For array printing, 'stride1dim' indicates which mesh dimenstride is // clang-format off // stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index 2a568de38a..cf1aa0c861 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -16,13 +16,13 @@ /* * EXERCISE #8: Tiled Matrix Transpose * - * In this exercise, you will use RAJA constructs to transpose a matrix + * In this exercise, you will use RAJA constructs to transpose a matrix * using a loop tiling algorithm. An input matrix A of dimension N_r x N_c * is provided. You will fill in the entries of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. * You will complete implementations of multiple RAJA variants by filling - * in missing elements of RAJA kernel API execution policies as well as the + * in missing elements of RAJA kernel API execution policies as well as the * RAJA kernel implementation for each. Variants you will complete include * sequential, OpenMP, and CUDA execution. 
* @@ -56,7 +56,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -70,8 +70,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -85,8 +85,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Construct a permuted layout for At so that the column index has stride 1 // std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -101,14 +100,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -116,8 +117,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries @@ -125,29 +128,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that output matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { int col = bx * TILE_SZ + tcol; // Matrix column index int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. @@ -156,14 +161,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. // -// Note: this needs to be turned on for other back-ends when working the +// Note: this needs to be turned on for other back-ends when working the // exercises (sequential, CUDA, etc.) 
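// A sketch of the sequential tiled policy and kernel call that the first TODO
// below asks for, mirroring ex8_tiled-matrix-transpose_solution.cpp later in
// this patch. TILE_SZ is this file's tile dimension; col_Range and row_Range
// are the segments defined just below (inside the #if guard noted above).
// The <TILE_SZ> arguments and the RAJA::kernel template argument are
// reconstructed, since the patch text drops text between angle brackets.

using KERNEL_EXEC_POL_SEQ =
    RAJA::KernelPolicy<
      RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
        RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
          RAJA::statement::For<1, RAJA::seq_exec,
            RAJA::statement::For<0, RAJA::seq_exec,
              RAJA::statement::Lambda<0>
            >
          >
        >
      >
    >;

RAJA::kernel<KERNEL_EXEC_POL_SEQ>(RAJA::make_tuple(col_Range, row_Range),
                                  [=](int col, int row)
                                  { Atview(col, row) = Aview(row, col); });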
#if defined(RAJA_ENABLE_OPENMP) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -188,7 +193,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif -// clang-format on + // clang-format on /// /// TODO... @@ -205,7 +210,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -229,7 +235,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif -// clang-format on + // clang-format on /// /// TODO... @@ -246,9 +252,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -258,7 +265,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. // using KERNEL_EXEC_POL_OMP2 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -272,14 +279,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list -// clang-format on - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -309,7 +314,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; #endif -// clang-format on + // clang-format on /// /// TODO... 
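// For the CUDA kernel TODO that begins just above, the corresponding
// invocation in ex8_tiled-matrix-transpose_solution.cpp (later in this patch)
// has the following shape; the policy alias name and the RAJA::kernel
// template argument are taken/reconstructed from that file, so verify them
// when working the exercise.

RAJA::kernel<KERNEL_EXEC_POL_CUDA>(RAJA::make_tuple(col_Range, row_Range),
                                   [=] RAJA_DEVICE (int col, int row)
                                   { Atview(col, row) = Aview(row, col); });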
diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp index b3c8fac085..40824fa496 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose_solution.cpp @@ -54,7 +54,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -68,8 +68,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -83,8 +83,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Construct a permuted layout for At so that the column index has stride 1 // std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -99,14 +98,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -114,38 +115,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // // Note: loops are ordered so that output matrix data access - // is stride-1. + // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { int col = bx * TILE_SZ + tcol; // Matrix column index int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. 
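// One detail worth noting about the RAJA::kernel calls in this file: the
// segment at position 0 of the iteration-space tuple (col_Range below) is
// what the policies' Tile<0>/For<0> statements refer to, and it supplies the
// first lambda argument; position 1 (row_Range) maps to Tile<1>/For<1> and
// the second argument. That is why the lambdas are written
// [=](int col, int row) even though the row tile is the outermost statement.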
@@ -156,7 +161,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -167,7 +172,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_SEQ = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -182,18 +187,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -203,7 +208,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -218,21 +223,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -243,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using KERNEL_EXEC_POL_OMP2 = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -257,14 +261,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list -// clang-format on - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) + { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -277,7 +279,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using KERNEL_EXEC_POL_CUDA = -// clang-format off + // clang-format off 
RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -294,14 +296,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + // clang-format on + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) + { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp index 000dbf5db1..e88b43124c 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array.cpp @@ -17,10 +17,10 @@ * EXERCISE #9: Matrix Transpose with Local Array * * In this exercise, you will use RAJA constructs to transpose a matrix - * using a loop tiling algorithm similar to exercise 8. However, this + * using a loop tiling algorithm similar to exercise 8. However, this * exercise is different in that you will use a local array to write - * to and read from as each matrix tile is transposed. An input matrix - * A of dimension N_r x N_c is provided. You will fill in the entries + * to and read from as each matrix tile is transposed. An input matrix + * A of dimension N_r x N_c is provided. You will fill in the entries * of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. @@ -61,7 +61,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -75,8 +75,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -90,8 +90,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Construct a permuted layout for At so that the column index has stride 1 // std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -106,14 +105,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -121,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < 
outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -133,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { int col = bcol * TILE_SZ + tcol; // Matrix column index int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -152,25 +158,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { int col = bcol * TILE_SZ + tcol; // Matrix column index int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -194,7 +202,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -203,7 +211,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index 2f122a6f12..3f733406a9 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -17,10 +17,10 @@ * EXERCISE #9: Matrix Transpose with Local Array * * In this exercise, you will use RAJA constructs to transpose a matrix - * using a loop tiling algorithm similar to exercise 8. However, this + * using a loop tiling algorithm similar to exercise 8. However, this * exercise is different in that you will use a local array to write - * to and read from as each matrix tile is transposed. An input matrix - * A of dimension N_r x N_c is provided. You will fill in the entries + * to and read from as each matrix tile is transposed. An input matrix + * A of dimension N_r x N_c is provided. You will fill in the entries * of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. 
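// The core of the local-array pattern used in this exercise, condensed from
// the kernel_param calls further down in this solution file. The template
// arguments (element type, RAJA::Perm<0, 1>, and the SizeList extents) are
// reconstructed because the patch text drops text between angle brackets;
// the full policy, including the InitLocalMem statement that actually
// allocates the tile, appears below.

using TILE_MEM =
    RAJA::LocalArray<int, RAJA::Perm<0, 1>, RAJA::SizeList<TILE_SZ, TILE_SZ>>;

TILE_MEM RAJA_Tile;  // created here; memory is allocated by InitLocalMem

RAJA::kernel_param<SEQ_EXEC_POL>(
    RAJA::make_tuple(col_Range, row_Range),
    RAJA::make_tuple((int)0, (int)0, RAJA_Tile),
    // first lambda: read a tile of A with stride-1 input access
    [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile)
    { RAJA_Tile(trow, tcol) = Aview(row, col); },
    // second lambda: write the tile out transposed with stride-1 output access
    [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile)
    { Atview(col, row) = RAJA_Tile(trow, tcol); });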
@@ -61,7 +61,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); // clang-format on -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -75,8 +75,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -90,8 +90,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Construct a permuted layout for At so that the column index has stride 1 // std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -106,14 +105,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -121,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -133,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { int col = bcol * TILE_SZ + tcol; // Matrix column index int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -152,25 +158,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { int col = bcol * TILE_SZ + tcol; // Matrix column index int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -192,7 +200,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -201,12 +209,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); using SEQ_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -236,22 +244,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), - - RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - - [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - - RAJA_Tile(trow, tcol) = Aview(row, col); - - }, + // clang-format on + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { + RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - Atview(col, row) = RAJA_Tile(trow, tcol); + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) + { + RAJA_Tile(trow, tcol) = Aview(row, col); + }, - }); + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) + { + Atview(col, row) = RAJA_Tile(trow, tcol); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -264,7 +271,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using OPENMP_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, @@ -293,21 +300,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + // clang-format on + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) + { RAJA_Tile(trow, tcol) = Aview(row, col); - }, - [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) + { Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); @@ -323,7 +329,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = -// clang-format off + // clang-format off RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, @@ -358,22 +364,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; -// clang-format on + // clang-format on - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) + { RAJA_Tile(trow, tcol) = Aview(row, col); - }, - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - + [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) + { Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); diff --git a/exercises/vector-addition.cpp b/exercises/vector-addition.cpp index 7b9b36c2d9..3192af15d2 100644 --- a/exercises/vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,75 +32,77 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. */ /* Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 256; +// constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//constexpr int SYCL_BLOCK_SIZE = 256; +// constexpr int SYCL_BLOCK_SIZE = 256; #endif // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. 
+ // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -110,27 +112,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::seq_exec execution policy type. + /// method and RAJA::seq_exec execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// // _rajaseq_vector_add_start -// clang-format off + // clang-format off RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end -// clang-format on + // clang-format on checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -144,12 +146,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
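// For the RAJA OpenMP variant of this exercise (the TODO further down in this
// section), the solution file later in this patch uses the following pattern;
// the execution-policy template argument is reconstructed here, since the
// patch text drops text between angle brackets.

RAJA::forall<RAJA::omp_parallel_for_exec>(
    RAJA::TypedRangeSegment<int>(0, N),
    [=](int i) { c[i] = a[i] + b[i]; });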
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -157,21 +159,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -187,13 +190,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -201,12 +204,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); /// /// TODO... @@ -215,53 +218,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::cuda_exec execution policy type. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. 
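// A sketch for the two CUDA TODOs in this section. The first follows the
// standard RAJA::forall pattern on the device copies made above;
// CUDA_BLOCK_SIZE is the constant commented out near the top of this file.
// For the "2 blocks per SM, asynchronous" variant, the policy spelling shown
// below (cuda_exec_explicit<block size, blocks per SM, async>) is an
// assumption that may differ across RAJA versions, so check it against your
// installation.

RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; });

// Possible explicit/asynchronous form (an assumption, as noted above):
// RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2, true>>(
//     RAJA::TypedRangeSegment<int>(0, N),
//     [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; });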
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout + << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; /// /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::cuda_exec execution policy type with + /// method and RAJA::cuda_exec execution policy type with /// arguments defining 2 blocks per SM and asynchronous execution. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... @@ -270,29 +274,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. 
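// The HIP and SYCL TODOs in this file follow the same shape as the CUDA
// sketch above. Here is the HIP case on the device copies made in that
// section (HIP_BLOCK_SIZE is the constant commented out near the top of the
// file); for SYCL, the analogous policy would be RAJA::sycl_exec with a
// work-group size, but treat that spelling as an assumption to verify
// against your RAJA version.

RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
    RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; });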
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); @@ -304,24 +308,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'SYCL_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -338,12 +342,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -354,9 +365,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp index 9bc491ad0c..804a3de174 100644 --- a/exercises/vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,7 +32,7 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. 
*/ /* @@ -53,95 +53,94 @@ constexpr int SYCL_BLOCK_SIZE = 256; // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA sequential vector addition...\n"; // _rajaseq_vector_add_start - RAJA::forall< RAJA::seq_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. 
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA SIMD vector addition...\n"; -// clang-format off + // clang-format off RAJA::forall( RAJA::TypedRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; } - ); + ); -// clang-format on + // clang-format on checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -149,21 +148,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -172,21 +172,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA OpenMP multithreaded vector addition...\n"; // _rajaomp_vector_add_start - RAJA::forall< RAJA::omp_parallel_for_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaomp_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -194,124 +191,125 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); // _rajacuda_vector_add_start -// clang-format off + // clang-format off RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_vector_add_end -// clang-format on + // clang-format on - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout + << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; // _rajacuda_explicit_vector_add_start const bool Asynchronous = true; -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_explicit_vector_add_end -// clang-format on + // clang-format on - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); // _rajahip_vector_add_start -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajahip_vector_add_end -// clang-format on + // clang-format on - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); // _rajasycl_vector_add_start -// clang-format off + // clang-format off RAJA::forall>(RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajasycl_vector_add_end -// clang-format on + // clang-format on memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -328,12 +326,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -344,9 +349,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vertexsum-indexset.cpp b/exercises/vertexsum-indexset.cpp index 5763e5b48f..6c33cdedb7 100644 --- a/exercises/vertexsum-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,189 +68,204 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// - constexpr int N = 1000; - constexpr int Nelem = N; + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. 
+ // + constexpr int N = 1000; + constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; - constexpr int Nvert = N + 1; + constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end - double* areae = memoryManager::allocate(Nelem_tot); - double* areav = memoryManager::allocate(Nvert_tot); + // _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { - int j = ie / Nelem; - int imap = 4 * ie ; - e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int j = ie / Nelem; + int imap = 4 * ie; + e2v_map[imap] = ie + j; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int i = ie % Nelem; - int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
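To make the element-to-vertex mapping above concrete: for element ie in element row j (j = ie / Nelem), the lower-left vertex index is i + j * Nvert = ie + j, because ie = i + j * Nelem and the vertex grid has one more column than the element grid (Nvert = Nelem + 1). The first element (ie = 0) therefore touches vertices 0, 1, Nvert, and Nvert + 1, and the remaining three entries of e2v_map give its lower-right, upper-left, and upper-right vertices.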
+ //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// - camp::resources::Resource host_res{camp::resources::Host()}; + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // + camp::resources::Resource host_res {camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); /// /// TODO... @@ -260,56 +275,58 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// below to check if it's correct. 
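The corresponding solution file later in this patch (vertexsum-indexset_solution.cpp) fills in this TODO by pushing back the remaining three list segments, one per color:

  colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), host_res));
  colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), host_res));
  colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), host_res));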
/// -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; - RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - }); -// _raja_vertexarea_omp_end + RAJA::forall(colorset, + [=](int ie) + { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + }); + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// - camp::resources::Resource cuda_res{camp::resources::Cuda()}; + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // + camp::resources::Resource cuda_res {camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); /// /// TODO... 
@@ -321,88 +338,93 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start -// clang-format off + // _raja_vertexarea_cuda_start + // clang-format off using EXEC_POL2 = RAJA::ExecPolicy>; -// clang-format on - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - }); -// _raja_vertexarea_cuda_end + // clang-format on + RAJA::forall(cuda_colorset, + [=] RAJA_DEVICE(int ie) + { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + }); + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), + hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// - camp::resources::Resource hip_res{camp::resources::Hip()}; + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // + camp::resources::Resource hip_res {camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start -// clang-format off + // _raja_vertexarea_hip_start + // clang-format off using EXEC_POL3 = RAJA::ExecPolicy>; -// clang-format on - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; - }); -// _raja_vertexarea_hip_end - - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + // clang-format on + RAJA::forall(hip_colorset, + [=] RAJA_DEVICE(int ie) + { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; + }); + // _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -410,7 +432,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -429,12 +451,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -445,11 +474,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/vertexsum-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp index d81fc9487d..d5baca47c8 100644 --- a/exercises/vertexsum-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,333 +68,355 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// - constexpr int N = 1000; - constexpr int Nelem = N; + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. 
+ // + constexpr int N = 1000; + constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; - constexpr int Nvert = N + 1; + constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end - double* areae = memoryManager::allocate(Nelem_tot); - double* areav = memoryManager::allocate(Nvert_tot); + // _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { - int j = ie / Nelem; - int imap = 4 * ie ; - e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int j = ie / Nelem; + int imap = 4 * ie; + e2v_map[imap] = ie + j; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int i = ie % Nelem; - int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// - camp::resources::Resource host_res{camp::resources::Host()}; + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // + camp::resources::Resource host_res {camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
-// _vertexarea_indexset_start + // _vertexarea_indexset_start RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); - colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); - colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); -// _vertexarea_indexset_end + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); + colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), host_res)); + colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), host_res)); + colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), host_res)); + // _vertexarea_indexset_end -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; - RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - }); -// _raja_vertexarea_omp_end + RAJA::forall(colorset, + [=](int ie) + { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + }); + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// - camp::resources::Resource cuda_res{camp::resources::Cuda()}; + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // + camp::resources::Resource cuda_res {camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. 
This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), cuda_res)); std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start -// clang-format off + // _raja_vertexarea_cuda_start + // clang-format off using EXEC_POL2 = RAJA::ExecPolicy>; -// clang-format on - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - }); -// _raja_vertexarea_cuda_end + // clang-format on + RAJA::forall(cuda_colorset, + [=] RAJA_DEVICE(int ie) + { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + }); + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), + hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * 
sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// - camp::resources::Resource hip_res{camp::resources::Hip()}; + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // + camp::resources::Resource hip_res {camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start -// clang-format off + // _raja_vertexarea_hip_start + // clang-format off using EXEC_POL3 = RAJA::ExecPolicy>; -// clang-format on - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; - }); -// _raja_vertexarea_hip_end - - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + // clang-format on + RAJA::forall(hip_colorset, + [=] RAJA_DEVICE(int ie) + { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; + }); + // _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -402,7 +424,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -421,12 +443,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -437,11 +466,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp index 5b9601a21e..35c63726ee 100644 --- a/exercises/view-layout.cpp +++ b/exercises/view-layout.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,95 +58,103 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication 
reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays // // _default_views_init_start - constexpr int Nx = 3; - constexpr int Ny = 5; - constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,49 +162,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// 
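// For reference (an illustrative sketch, not taken from the RAJA sources):
// the default layouts exercised below are row-major, so the rightmost index
// has unit stride. Assuming the Nx = 3, Ny = 5 sizes defined above, a 2D view
// and manual index arithmetic address the same element, e.g.
//
//   RAJA::View<int, RAJA::Layout<2, int>> v2(a, Nx, Ny);  // 'v2' is a hypothetical name
//   // v2(i, j) aliases a[j + i * Ny], so v2(1, 2) is a[7]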
std::cout << "\n\t Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); - int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + int iter {0}; + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; @@ -205,23 +216,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout that iterates over the /// data array 'a' with unit stride. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; @@ -229,23 +240,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _default_perm_view2D_start std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; @@ -258,11 +271,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// three-dimensional RAJA::Layout with the identity permutation. 
/// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + //----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; @@ -270,23 +283,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _perm_2D_start std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; @@ -297,7 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout with the permutation - /// {2, 1, 0}. + /// {2, 1, 0}. /// /// Name the Layout object 'perm3a_layout' so it can be used /// with the index conversion methods in the section below. @@ -305,10 +320,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Layout object you create here. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; @@ -316,14 +331,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _permb_view3D_start std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -331,29 +349,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. We show these below using the -// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -361,44 +379,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\nperm3a_layout...\n" << std::endl; int lin = -1; - int i = -1; - int j = -1; - int k = -1; + int i = -1; + int j = -1; + int k = -1; -/* - // _perm3d_layout_start - lin = perm3a_layout(1, 2, 0); - std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; - std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + /* + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(7, i, j, k); - std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - // _perm3d_layout_end + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end - lin = perm3a_layout(2, 3, 1); - std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; - std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(26, i, j, k); - std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - lin = perm3a_layout(0, 2, 1); - std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; - std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(21, i, j, k); - std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; -*/ + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + */ 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -409,7 +427,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -419,7 +438,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -428,7 +448,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "(since perm is {1, 2, 0})" << std::endl; perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; /// /// TODO... @@ -438,11 +459,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// data array 'a' with unit stride. /// -//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -450,10 +471,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -467,37 +488,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start -// clang-format off + // clang-format off RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, offlayout_1D); -// clang-format on - for (int i = imin; i < imax; ++i) { + // clang-format on + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); 
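// For reference (an illustrative sketch, not taken from the RAJA sources):
// the 1D offset layout above only shifts the index range, so with imin = -5
// an access through the offset view touches the same element as the C-style
// reference loop:
//
//   aoview_1Doff(-5) == ao[0];   aoview_1Doff(0) == ao[5];
//   // in general aoview_1Doff(i) aliases ao[i - imin] for i in [imin, imax)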
-//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -508,23 +531,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); // _cstyle_offlayout2D_start - imin = -1; - imax = 2; + imin = -1; + imax = 2; int jmin = -5; int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); @@ -536,10 +561,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// same operations as the C-style example above. /// - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -551,54 +576,58 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start std::array perm1D {{1, 0}}; -// clang-format off + // clang-format off RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, {{imax, jmax}}, perm1D ); -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, permofflayout_2D); -// clang-format on + // clang-format on iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp index 26316bc2c7..54db5c52d8 100644 --- a/exercises/view-layout_solution.cpp +++ b/exercises/view-layout_solution.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,95 +58,103 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// 
std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays // // _default_views_init_start - constexpr int Nx = 3; - constexpr int Ny = 5; - constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,61 +162,67 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); - int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + int iter {0}; + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + 
//----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - // _default_view3D_start - RAJA::View< int, RAJA::Layout<3, int> > view_3D(a, Nx, Ny, Nz); + // _default_view3D_start + RAJA::View> view_3D(a, Nx, Ny, Nz); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { view_3D(i, j, k) = iter; ++iter; } @@ -216,18 +230,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; @@ -235,23 +249,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _default_perm_view2D_start std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; @@ -259,14 +275,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _default_perm_view3D_start std::array defperm3 {{0, 1, 2}}; - RAJA::Layout< 3, int > defperm3_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3); - RAJA::View< int, RAJA::Layout<3, int> > defperm_view_3D(a, defperm3_layout); + RAJA::Layout<3, int> defperm3_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, defperm3); + RAJA::View> defperm_view_3D(a, defperm3_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { defperm_view_3D(i, j, k) = iter; ++iter; } @@ -274,11 +293,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_perm_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + 
//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; @@ -286,23 +305,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _perm_2D_start std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; @@ -310,14 +331,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _perma_view3D_start std::array perm3a {{2, 1, 0}}; - RAJA::Layout< 3, int > perm3a_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3a); - RAJA::View< int, RAJA::Layout<3, int> > perm3a_view_3D(a, perm3a_layout); + RAJA::Layout<3, int> perm3a_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3a); + RAJA::View> perm3a_view_3D(a, perm3a_layout); iter = 0; - for (int k = 0; k < Nz; ++k) { - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int k = 0; k < Nz; ++k) + { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm3a_view_3D(i, j, k) = iter; ++iter; } @@ -325,10 +349,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _perma_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; @@ -336,14 +360,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _permb_view3D_start std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -351,29 +378,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. 
We show these below using the -// three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -381,9 +408,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\nperm3a_layout...\n" << std::endl; int lin = -1; - int i = -1; - int j = -1; - int k = -1; + int i = -1; + int j = -1; + int k = -1; // _perm3d_layout_start lin = perm3a_layout(1, 2, 0); @@ -393,7 +420,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(7, i, j, k); std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; // _perm3d_layout_end @@ -404,7 +432,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(26, i, j, k); std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3a_layout(0, 2, 1); @@ -414,9 +443,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(21, i, j, k); std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -427,7 +457,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -437,7 +468,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -447,13 +479,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; 
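// For reference (an illustrative sketch, not taken from the RAJA sources):
// the permutation lists dimensions from slowest to fastest stride, so for
// perm3b = {1, 2, 0} index i has unit stride, k has stride Nx, and j has
// stride Nx * Nz. The linear indices printed above follow from
//
//   lin = i + k * Nx + j * Nx * Nz;
//   // e.g. perm3b_layout(1, 2, 0) -> 1 + 0 + 12 = 13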
-//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -461,10 +494,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -478,37 +511,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start -// clang-format off + // clang-format off RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, offlayout_1D); -// clang-format on - for (int i = imin; i < imax; ++i) { + // clang-format on + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -519,49 +554,53 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); // _cstyle_offlayout2D_start - imin = -1; - imax = 2; + imin = -1; + imax = 2; int jmin = -5; int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout2D_start -// clang-format off + // clang-format off RAJA::OffsetLayout<2, int> offlayout_2D = RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, offlayout_2D); iter = 0; -// clang-format on - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { + // clang-format on + for (int i = imin; i < imax; 
++i) + { + for (int j = jmin; j < jmax; ++j) + { aoview_2Doff(i, j) = iter; iter++; } } // _raja_offlayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -573,54 +612,58 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start std::array perm1D {{1, 0}}; -// clang-format off + // clang-format off RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, {{imax, jmax}}, perm1D ); -// clang-format on -// clang-format off + // clang-format on + // clang-format off RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, permofflayout_2D); -// clang-format on + // clang-format on iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; From 65fd2d47db4c07c6619d3a734b2c058ad24dfdf8 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 5 Nov 2024 11:31:19 -0800 Subject: [PATCH 5/5] Clang format --- .clang-format | 7 +- benchmark/benchmark-atomic.cpp | 626 ++-- benchmark/host-device-lambda-benchmark.cpp | 23 +- benchmark/ltimes.cpp | 2987 ++++++++++---------- benchmark/raja_view_blur.cpp | 231 +- cmake/RAJAMacros.cmake | 16 +- 6 files changed, 1988 insertions(+), 1902 deletions(-) diff --git a/.clang-format b/.clang-format index b6fa54b233..898f791bf7 100644 --- a/.clang-format +++ b/.clang-format @@ -31,7 +31,7 @@ BraceWrapping: AfterExternBlock: false BeforeCatch: true BeforeElse: true - BeforeLambdaBody: true + BeforeLambdaBody: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false @@ -40,12 +40,13 @@ BraceWrapping: # Pointer alignment DerivePointerAlignment: false PointerAlignment: Left + +# Single line config AllowShortIfStatementsOnASingleLine : true AllowShortFunctionsOnASingleLine : true AllowShortLoopsOnASingleLine : false AllowAllArgumentsOnNextLine : true AllowAllParametersOfDeclarationOnNextLine : false -AlignTrailingComments : true BinPackArguments : true BinPackParameters : false ConstructorInitializerAllOnOneLineOrOnePerLine : true @@ -67,4 +68,4 @@ SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: false SpacesInConditionalStatement: false SpacesInParentheses: false -SpacesInSquareBrackets: false +SpacesInSquareBrackets: false \ No newline at end of file diff --git a/benchmark/benchmark-atomic.cpp b/benchmark/benchmark-atomic.cpp index ebe3a858ff..ecb316b451 100644 --- a/benchmark/benchmark-atomic.cpp +++ b/benchmark/benchmark-atomic.cpp @@ -5,15 +5,16 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // This file is intended to provide an interface for comparing the performance -// of RAJA's atomic implementations with Desul's atomic implementations. In order -// to accomplish this without requiring two separate build system configurations -// this file directly includes "desul/atomics.hpp" and directly calls desul namespace -// atomics. This is different from how a typical RAJA user would call a desul atomic. +// of RAJA's atomic implementations with Desul's atomic implementations. In +// order to accomplish this without requiring two separate build system +// configurations this file directly includes "desul/atomics.hpp" and directly +// calls desul namespace atomics. This is different from how a typical RAJA +// user would call a desul atomic. #include "RAJA/RAJA.hpp" #include "RAJA/util/for_each.hpp" #include "RAJA/util/Timer.hpp" -#if defined (RAJA_ENABLE_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif #include "desul/atomics.hpp" @@ -25,85 +26,99 @@ /// Conditional compilation for CUDA benchmarks. 
-#if defined (RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::cuda_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "CUDA execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::cuda_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "CUDA execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::cuda::cuda_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::cuda::cuda_atomic; }; -#elif defined (RAJA_ENABLE_HIP) +#elif defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::hip_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "HIP execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::hip_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "HIP execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::hip::hip_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::hip::hip_atomic; }; #endif #define BLOCK_SZ 256 -#define INDENT " " +#define INDENT " " using raja_default_desul_order = desul::MemoryOrderRelaxed; using raja_default_desul_scope = desul::MemoryScopeDevice; -// Desul atomics have a different signature than RAJA's built in ops. The following code provides some -// helper function templates so that they can be called using the same signature in timing code. +// Desul atomics have a different signature than RAJA's built in ops. The +// following code provides some helper function templates so that they can be +// called using the same signature in timing code. // Struct holding Desul atomic signature typedef -template -struct DesulAtomicSignature { - using type = ReturnType(*)(Args..., raja_default_desul_order, raja_default_desul_scope); +template +struct DesulAtomicSignature +{ + using type = ReturnType (*)(Args..., + raja_default_desul_order, + raja_default_desul_scope); }; // Struct holding RAJA atomic signature typedef -template -struct RajaAtomicSignature { - using type = AtomicType(*)(AtomicType*, const AtomicType); +template +struct RajaAtomicSignature +{ + using type = AtomicType (*)(AtomicType*, const AtomicType); }; -/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and have an ambiguous type -/// so they can't be passed as a template parameter. -/// The following macro disambiguates the call and provide a signature comaptible with the DESUL -/// wrapper. AtomicOperation must be a valid RAJA namespace atomic operation, like atomicAdd, -/// atomicMax, etc. 
-#define OPERATOR_CALL_BINARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const { \ - return RAJA::AtomicOperation(Policy {}, acc, val); \ - } \ - -#define OPERATOR_CALL_UNARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const { \ - return RAJA::AtomicOperation(Policy {}, acc); \ - } \ - -#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ -template \ -struct AtomicFunctorName { \ - const char* name = #AtomicFunctorName ; \ - AtomicOperatorDeclaration \ -}; \ +/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and +/// have an ambiguous type so they can't be passed as a template parameter. The +/// following macro disambiguates the call and provide a signature comaptible +/// with the DESUL wrapper. AtomicOperation must be a valid RAJA namespace +/// atomic operation, like atomicAdd, atomicMax, etc. +#define OPERATOR_CALL_BINARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc, val); \ + } + +#define OPERATOR_CALL_UNARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc); \ + } + +#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ + template \ + struct AtomicFunctorName \ + { \ + const char* name = #AtomicFunctorName; \ + AtomicOperatorDeclaration \ + }; DECLARE_ATOMIC_WRAPPER(AtomicAdd, OPERATOR_CALL_BINARY(atomicAdd)) DECLARE_ATOMIC_WRAPPER(AtomicSub, OPERATOR_CALL_BINARY(atomicSub)) @@ -119,228 +134,365 @@ DECLARE_ATOMIC_WRAPPER(AtomicXor, OPERATOR_CALL_BINARY(atomicXor)) DECLARE_ATOMIC_WRAPPER(AtomicExchange, OPERATOR_CALL_BINARY(atomicExchange)) DECLARE_ATOMIC_WRAPPER(AtomicLoad, OPERATOR_CALL_UNARY(atomicLoad)) -/// Instead of complicating the above macro to handle these two atomics, do the declarations -/// manually below. -template -struct AtomicStore { - const char* name = "AtomicStore"; - template - RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const { - return RAJA::atomicStore(Policy {}, acc, val); - } +/// Instead of complicating the above macro to handle these two atomics, do the +/// declarations manually below. +template +struct AtomicStore +{ + const char* name = "AtomicStore"; + template + RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const + { + return RAJA::atomicStore(Policy {}, acc, val); + } }; -template -struct AtomicCAS { - const char* name = "AtomicCAS"; - template - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const { - return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); - } +template +struct AtomicCAS +{ + const char* name = "AtomicCAS"; + template + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const + { + return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); + } }; /// ExecPolicy wrapper for OpenMP -struct ExecPolicyOMP { - using policy = RAJA::omp_parallel_for_exec;; - static std::string PolicyName() { - std::stringstream ss; - ss << "OpenMP execution"; - return ss.str(); - } +struct ExecPolicyOMP +{ + using policy = RAJA::omp_parallel_for_exec; + ; + static std::string PolicyName() + { + std::stringstream ss; + ss << "OpenMP execution"; + return ss.str(); + } }; -/// Functor wrapping the desul implementation. 
Wrapping the desul call ensures an identical signature with
-/// RAJA's implementations. Wrapping the call in a functor allows simple type deduction for printing
-/// from within the benchmark.
-template::type atomic_impl>
-struct atomicWrapperDesulTernary {
-  /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature
-  RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const {
-    return atomic_impl(acc, value, T(1), raja_default_desul_order{},
-                       raja_default_desul_scope{});
-  }
+/// Functor wrapping the desul implementation. Wrapping the desul call ensures
+/// an identical signature with RAJA's implementations. Wrapping the call in a
+/// functor allows simple type deduction for printing from within the benchmark.
+template <
+    typename T,
+    typename ReturnType,
+    typename DesulAtomicSignature::type atomic_impl>
+struct atomicWrapperDesulTernary
+{
+  /// Call operator overload template that allows invoking DESUL atomic with a
+  /// (int*)(T*, T) signature
+  RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const
+  {
+    return atomic_impl(acc, value, T(1), raja_default_desul_order {},
+                       raja_default_desul_scope {});
+  }
 };

-template::type atomic_impl>
-struct atomicWrapperDesulBinary {
-  /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature
-  RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const {
-    return atomic_impl(acc, value, raja_default_desul_order{},
-                       raja_default_desul_scope{});
-  }
+template <
+    typename T,
+    typename ReturnType,
+    typename DesulAtomicSignature::type atomic_impl>
+struct atomicWrapperDesulBinary
+{
+  /// Call operator overload template that allows invoking DESUL atomic with a
+  /// (int*)(T*, T) signature
+  RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const
+  {
+    return atomic_impl(acc, value, raja_default_desul_order {},
+                       raja_default_desul_scope {});
+  }
 };

/// Unary wrapper variant for increment and decrement benchmarks.
-template::type atomic_impl> -struct atomicWrapperDesulUnary { - RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const { - return atomic_impl(acc, raja_default_desul_order{}, - raja_default_desul_scope{}); - } +template ::type atomic_impl> +struct atomicWrapperDesulUnary +{ + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const + { + return atomic_impl(acc, raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; -template -class IsDesul : public std::false_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template -std::string GetImplName (const AtomicImplType& impl) { - if (IsDesul::value) { - return "Desul atomic"; - } else { - return "RAJA atomic"; - } +template +class IsDesul : public std::false_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template +std::string GetImplName(const AtomicImplType& impl) +{ + if (IsDesul::value) + { + return "Desul atomic"; + } + else + { + return "RAJA atomic"; + } } -template -void TimeAtomicOp(const AtomicImplType& atomic_impl, uint64_t N, uint64_t num_iterations = 4, int array_size = 100, bool print_to_output = true) { - RAJA::Timer timer; - - // Allocate memory - AtomicType* device_value = nullptr; - int len_array = test_array ? array_size : 1; - camp::resources::Resource resource {RAJA::resources::get_resource::type::get_default()}; - device_value = resource.allocate(len_array); - - timer.start(); - if (test_array) { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); - }); - } - } else { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(device_value, AtomicType(1)); - }); - } +template +void TimeAtomicOp(const AtomicImplType& atomic_impl, + uint64_t N, + uint64_t num_iterations = 4, + int array_size = 100, + bool print_to_output = true) +{ + RAJA::Timer timer; + + // Allocate memory + AtomicType* device_value = nullptr; + int len_array = test_array ? 
array_size : 1; + camp::resources::Resource resource {RAJA::resources::get_resource< + typename ExecPolicy::policy>::type::get_default()}; + device_value = resource.allocate(len_array); + + timer.start(); + if (test_array) + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); }); } - - resource.wait(); - timer.stop(); - resource.deallocate(device_value); - - double t = timer.elapsed(); - if (print_to_output) { - std::cout << INDENT << INDENT << t << "s" << INDENT; - std::cout << GetImplName(atomic_impl) << ", "; - if (test_array) { - std::cout << "Number of atomics under contention " << array_size << ", "; - } - std::cout << num_iterations * N << " many atomic operations" << ", "; - std::cout << ExecPolicy::PolicyName(); - std::cout << std::endl; + } + else + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(device_value, AtomicType(1)); }); + } + } + + resource.wait(); + timer.stop(); + resource.deallocate(device_value); + + double t = timer.elapsed(); + if (print_to_output) + { + std::cout << INDENT << INDENT << t << "s" << INDENT; + std::cout << GetImplName(atomic_impl) << ", "; + if (test_array) + { + std::cout << "Number of atomics under contention " << array_size << ", "; } + std::cout << num_iterations * N << " many atomic operations" + << ", "; + std::cout << ExecPolicy::PolicyName(); + std::cout << std::endl; + } } -template +template struct list_concat; -template -struct list_concat, camp::list> { - using type = camp::list; +template +struct list_concat, camp::list> +{ + using type = camp::list; }; -/// Holder for atomic operations that work with arbitrary atomic type, e.g. double, float -/// and ints etc. -template -struct universal_atomic_ops { - using type = camp::list, AtomicAdd>, - std::pair, AtomicSub>, - std::pair, AtomicMin>, - std::pair, AtomicMax>, - std::pair, AtomicIncBinary>, - std::pair, AtomicDecBinary>, - std::pair, AtomicIncUnary>, - std::pair, AtomicDecUnary>, - std::pair, AtomicLoad>, - std::pair, AtomicStore>, - std::pair, AtomicExchange>, - std::pair, AtomicCAS>>; +/// Holder for atomic operations that work with arbitrary atomic type, e.g. +/// double, float and ints etc. +template +struct universal_atomic_ops +{ + using type = camp::list< + std::pair, + AtomicAdd>, + std::pair, + AtomicSub>, + std::pair, + AtomicMin>, + std::pair, + AtomicMax>, + std::pair, + AtomicIncBinary>, + std::pair, + AtomicDecBinary>, + std::pair, + AtomicIncUnary>, + std::pair, + AtomicDecUnary>, + std::pair, + AtomicLoad>, + std::pair< + atomicWrapperDesulBinary, + AtomicStore>, + std::pair, + AtomicExchange>, + std::pair, + AtomicCAS>>; }; -template -struct integral_atomic_ops { - using type = camp::list, AtomicAnd>, - std::pair, AtomicOr>, - std::pair, AtomicXor>>; +template +struct integral_atomic_ops +{ + using type = + camp::list, + AtomicAnd>, + std::pair, + AtomicOr>, + std::pair, + AtomicXor>>; }; -template +template struct atomic_ops; /// Include all atomic ops if the underlying atomic to benchmark is integral. 
-template -struct atomic_ops::value>::type> { - using type = typename list_concat::type, typename integral_atomic_ops::type>::type; +template +struct atomic_ops< + AtomicDataType, + Policy, + typename std::enable_if::value>::type> +{ + using type = typename list_concat< + typename universal_atomic_ops::type, + typename integral_atomic_ops::type>::type; }; /// Omit bitwise ops and, or, and xor for floating point types -template -struct atomic_ops::value>::type> { - using type = typename universal_atomic_ops< AtomicDataType, Policy >::type; +template +struct atomic_ops::value>::type> +{ + using type = typename universal_atomic_ops::type; }; -template -void ExecuteBenchmark(uint64_t N) { - using ops = atomic_ops; - using iter_t = typename ops::type; - auto iter = iter_t{}; - RAJA::for_each_type(iter, [&](auto type_pair) { +template +void ExecuteBenchmark(uint64_t N) +{ + using ops = atomic_ops; + using iter_t = typename ops::type; + auto iter = iter_t {}; + RAJA::for_each_type( + iter, + [&](auto type_pair) + { auto desul_functor = type_pair.first; - auto raja_functor = type_pair.second; - std::cout << INDENT << "Executing " << raja_functor.name << " integer benchmarks" << std::endl; - TimeAtomicOp(desul_functor, N, 100, 10000); - TimeAtomicOp(raja_functor, N, 100, 10000); - TimeAtomicOp(desul_functor, N, 10, 1000); - TimeAtomicOp(raja_functor, N, 10, 1000); + auto raja_functor = type_pair.second; + std::cout << INDENT << "Executing " << raja_functor.name + << " integer benchmarks" << std::endl; + TimeAtomicOp(desul_functor, N, 100, + 10000); + TimeAtomicOp(raja_functor, N, 100, + 10000); + TimeAtomicOp(desul_functor, N, 10, + 1000); + TimeAtomicOp(raja_functor, N, 10, + 1000); TimeAtomicOp(desul_functor, N, 4, 10); TimeAtomicOp(raja_functor, N, 4, 10); // Test contention over a single atomic TimeAtomicOp(desul_functor, N); TimeAtomicOp(raja_functor, N); - }); + }); } -int main (int argc, char* argv[]) { - if (argc > 2) { - RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the optional size of the benchmark loop"); - } - uint64_t N = 1000000000; - if (argc == 2) { - N = std::stoll(argv[1]); - } +int main(int argc, char* argv[]) +{ + if (argc > 2) + { + RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the " + "optional size of the benchmark loop"); + } + uint64_t N = 1000000000; + if (argc == 2) + { + N = std::stoll(argv[1]); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp, int, true>( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + TimeAtomicOp, int, true>( + AtomicAdd {}, N, 10, 1000, false); + // GPU benchmarks + std::cout << "Executing GPU benchmarks" << std::endl; + ExecuteBenchmark>( + N); +#endif + +#if defined(RAJA_ENABLE_OPENMP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp( + AtomicAdd {}, N, 10, 1000, false); + TimeAtomicOp( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + + // OpenMP benchmarks + std::cout << "Executing OpenMP benchmarks" << std::endl; + ExecuteBenchmark(N); +#endif - #if defined (RAJA_ENABLE_CUDA) || defined (RAJA_ENABLE_HIP) - // Perform an untimed initialization of both desul and RAJA atomics. 
- TimeAtomicOp, int, true>(atomicWrapperDesulBinary{}, N, 10, 1000, false); - TimeAtomicOp, int, true>(AtomicAdd{}, N, 10, 1000, false); - // GPU benchmarks - std::cout << "Executing GPU benchmarks" << std::endl; - ExecuteBenchmark>(N); - #endif - - #if defined (RAJA_ENABLE_OPENMP) - // Perform an untimed initialization of both desul and RAJA atomics. - TimeAtomicOp(AtomicAdd{}, N, 10, 1000, false); - TimeAtomicOp(atomicWrapperDesulBinary{}, N, 10, 1000, false); - - // OpenMP benchmarks - std::cout << "Executing OpenMP benchmarks" << std::endl; - ExecuteBenchmark(N); - #endif - - return 0; + return 0; } diff --git a/benchmark/host-device-lambda-benchmark.cpp b/benchmark/host-device-lambda-benchmark.cpp index 33c78e2d8c..dde4f02efb 100644 --- a/benchmark/host-device-lambda-benchmark.cpp +++ b/benchmark/host-device-lambda-benchmark.cpp @@ -17,13 +17,15 @@ static void benchmark_daxpy_raw(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { a[i] += b[i] * c; }); } @@ -34,13 +36,15 @@ static void benchmark_daxpy_host(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=] __host__(int i) { a[i] += b[i] * c; }); } @@ -51,17 +55,18 @@ static void benchmark_daxpy_host_device(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), - [=] RAJA_HOST_DEVICE(int i) { - a[i] += b[i] * c; - }); + [=] RAJA_HOST_DEVICE(int i) + { a[i] += b[i] * c; }); } } diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index 15a3e54eb8..b464b77396 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -7,46 +7,54 @@ // Place the following line before including RAJA to enable // statistics on the Vector abstractions -//#define RAJA_ENABLE_VECTOR_STATS +// #define RAJA_ENABLE_VECTOR_STATS // Un-comment the following line to run correctness checks on each variant -//#define DEBUG_LTIMES -//#define DEBUG_MATRIX_LOAD_STORE +// #define DEBUG_LTIMES +// #define DEBUG_MATRIX_LOAD_STORE #include "RAJA/config.hpp" -#define VARIANT_C 1 -#define VARIANT_C_VIEWS 1 -#define VARIANT_RAJA_SEQ 1 -#define VARIANT_RAJA_SEQ_ARGS 1 -#define VARIANT_RAJA_TEAMS_SEQ 1 -#define VARIANT_RAJA_VECTOR 1 -#define VARIANT_RAJA_MATRIX 1 -#define VARIANT_RAJA_SEQ_SHMEM 1 +#define VARIANT_C 1 +#define VARIANT_C_VIEWS 1 +#define VARIANT_RAJA_SEQ 1 +#define VARIANT_RAJA_SEQ_ARGS 1 +#define VARIANT_RAJA_TEAMS_SEQ 1 +#define VARIANT_RAJA_VECTOR 1 +#define VARIANT_RAJA_MATRIX 1 +#define VARIANT_RAJA_SEQ_SHMEM 1 #if defined(RAJA_ENABLE_OPENMP) -#define VARIANT_RAJA_OPENMP 1 +#define VARIANT_RAJA_OPENMP 1 #endif #if defined(RAJA_ENABLE_CUDA) -#define VARIANT_CUDA_KERNEL 1 -#define VARIANT_CUDA_TEAMS 1 -#define VARIANT_CUDA_TEAMS_MATRIX 1 -#define VARIANT_CUDA_KERNEL_SHMEM 1 +#define VARIANT_CUDA_KERNEL 1 +#define VARIANT_CUDA_TEAMS 1 +#define VARIANT_CUDA_TEAMS_MATRIX 1 +#define VARIANT_CUDA_KERNEL_SHMEM 1 #endif #if 
defined(RAJA_ENABLE_HIP) -#define RAJA_HIP_KERNEL 1 -#define RAJA_HIP_KERNEL_SHMEM 1 +#define RAJA_HIP_KERNEL 1 +#define RAJA_HIP_KERNEL_SHMEM 1 #endif - extern "C" { - void dgemm_(char * transa, char * transb, int * m, int * n, int * k, - double * alpha, double * A, int * lda, - double * B, int * ldb, double * beta, - double *, int * ldc); +void dgemm_(char* transa, + char* transb, + int* m, + int* n, + int* k, + double* alpha, + double* A, + int* lda, + double* B, + int* ldb, + double* beta, + double*, + int* ldc); } #include @@ -94,7 +102,6 @@ extern "C" { */ - using namespace RAJA; using namespace RAJA::expt; @@ -115,43 +122,43 @@ RAJA_INDEX_VALUE_T(IZ, int, "IZ"); // Function to check results // template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA LTIMES example...\n\n"; -//----------------------------------------------------------------------------// -// Define array dimensions, allocate arrays, define Layouts and Views, etc. + //----------------------------------------------------------------------------// + // Define array dimensions, allocate arrays, define Layouts and Views, etc. // Note: rand()/RAND_MAX is always zero, but forces the compiler to not // optimize out these values as compile time constants - const int num_m = 25 + (rand()/RAND_MAX); - const int num_g = 160 + (rand()/RAND_MAX); - const int num_d = 80 + (rand()/RAND_MAX); + const int num_m = 25 + (rand() / RAND_MAX); + const int num_g = 160 + (rand() / RAND_MAX); + const int num_d = 80 + (rand() / RAND_MAX); #ifdef DEBUG_LTIMES - const int num_iter = 1 ; //+ (rand()/RAND_MAX);; + const int num_iter = 1; //+ (rand()/RAND_MAX);; // use a decreased number of zones since this will take a lot longer // and we're not really measuring performance here - const long num_z = 32 + (rand()/RAND_MAX); + const long num_z = 32 + (rand() / RAND_MAX); #else - const int num_iter = 10 + (rand()/RAND_MAX); - const int num_z = 32*657 + (rand()/RAND_MAX); + const int num_iter = 10 + (rand() / RAND_MAX); + const int num_z = 32 * 657 + (rand() / RAND_MAX); #endif + double total_flops = 2.0 * num_g * num_z * num_d * num_m * num_iter * 1000.0; - double total_flops = 2.0*num_g*num_z*num_d*num_m*num_iter*1000.0; - - std::cout << "num_m = " << num_m << ", num_g = " << num_g << - ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; + std::cout << "num_m = " << num_m << ", num_g = " << num_g + << ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; std::cout << "total flops: " << (long)total_flops << "\n"; @@ -167,1133 +174,1076 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double* psi_data = &psi_vec[0]; double* phi_data = &phi_vec[0]; - for (int i = 0; i < L_size; ++i) { - L_data[i] = i+1; + for (int i = 0; i < L_size; ++i) + { + L_data[i] = i + 1; } - for (int i = 0; i < psi_size; ++i) { - psi_data[i] = 2*i+1; + for (int i = 0; i < psi_size; ++i) + { + psi_data[i] = 2 * i + 1; } // Note phi_data will be set to zero before each variant is run. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C -{ - std::cout << "\n Running baseline C-version of LTimes...\n"; + { + std::cout << "\n Running baseline C-version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // Using restrict doesn't make much of a difference for most compilers. + // Using restrict doesn't make much of a difference for most compilers. #if 1 - double * RAJA_RESTRICT L = L_data; - double * RAJA_RESTRICT psi = psi_data; - double * RAJA_RESTRICT phi = phi_data; + double* RAJA_RESTRICT L = L_data; + double* RAJA_RESTRICT psi = psi_data; + double* RAJA_RESTRICT phi = phi_data; #else - double * L = L_data; - double * psi = psi_data; - double * phi = phi_data; + double* L = L_data; + double* psi = psi_data; + double* phi = phi_data; #endif - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (int g = 0; g < num_g; ++g) { - for (int z = 0; z < num_z; ++z) { - for (int m = 0; m < num_m; ++m) { - for (int d = 0; d < num_d; ++d) { - phi[g*num_z*num_m + z*num_m + m] += - L[d*num_m + m] * psi[g*num_z*num_d + z*num_d + d]; + for (int iter = 0; iter < num_iter; ++iter) + for (int g = 0; g < num_g; ++g) + { + for (int z = 0; z < num_z; ++z) + { + for (int m = 0; m < num_m; ++m) + { + for (int d = 0; d < num_d; ++d) + { + phi[g * num_z * num_m + z * num_m + m] += + L[d * num_m + m] * psi[g * num_z * num_d + z * num_d + d]; + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; - -} + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C_VIEWS -{ - std::cout << "\n Running C-version of LTimes (with Views)...\n"; + { + std::cout << "\n Running C-version of LTimes (with Views)...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView 
psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { - for (IM m(0); m < num_m; ++m) { - for (ID d(0); d < num_d; ++d) { - phi(m, g, z) += L(m, d) * psi(d, g, z); + for (int iter = 0; iter < num_iter; ++iter) + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { + for (IM m(0); m < num_m; ++m) + { + for (ID d(0); d < num_d; ++d) + { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (with Views) (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (with Views) (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ -{ - std::cout << "\n Running RAJA sequential version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, 
IZ>; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ_ARGS -{ - std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0, Segs<0, 1, 2, 3>> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::memset(phi_data, 0, phi_size * sizeof(double)); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; -#if defined(DEBUG_LTIMES) 
- checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif -} -#endif + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; -//----------------------------------------------------------------------------// + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; -#if VARIANT_RAJA_TEAMS_SEQ -{ - std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + using EXECPOL = RAJA::KernelPolicy>>>>>>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + RAJA::Timer timer; + timer.start(); - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } +#endif - using pol_launch = RAJA::LaunchPolicy; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + //----------------------------------------------------------------------------// +#if VARIANT_RAJA_TEAMS_SEQ + { + std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::memset(phi_data, 0, phi_size * sizeof(double)); - RAJA::Timer timer; - timer.start(); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=](RAJA::LaunchContext ctx){ + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - phi(m, g, z) += L(m, d) * psi(d, g, z); - }); - }); - }); - }); + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - }); // laucnch - } // iter - 
timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); -#if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); -} + using pol_launch = RAJA::LaunchPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; + + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::launch( + RAJA::ExecPlace::HOST, RAJA::LaunchParams(), + [=](RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + }); + }); + }); + }); // laucnch + } // iter + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; + + +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_VECTOR -{ - std::cout << "\n Running RAJA vectorized version of LTimes...\n"; + { + std::cout << "\n Running RAJA vectorized version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, 
phi_perm)); - using vector_t = RAJA::expt::VectorRegister; - using VecIZ = RAJA::expt::VectorIndex; + using vector_t = RAJA::expt::VectorRegister; + using VecIZ = RAJA::expt::VectorIndex; - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d + using EXECPOL = RAJA::KernelPolicy< + statement::For<2, seq_exec, // g + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - >; + statement::Lambda<0>>>>>; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::expt::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - auto all_z = VecIZ::all(); + auto all_z = VecIZ::all(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g) { - phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, + [=](IM m, ID d, IG g) { + phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); + }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA vectorized version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA vectorized version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(psi_data, - 
RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; - //using matrix_t = RAJA::expt::SquareMatrixRegister; -// using matrix_t = RAJA::expt::RectMatrixRegister; + using matrix_t = + RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::RectMatrixRegister; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - - using RowM = RAJA::expt::RowIndex; - using ColD = RAJA::expt::ColIndex; - using ColZ = RAJA::expt::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::tensor_stats::resetVectorStats(); #endif - RAJA::Timer timer; - timer.start(); - + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) + for (int iter = 0; iter < num_iter; ++iter) { - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - -// phi(rows_m, g, cols_z) = (L(rows_m, cols_d) * psi(rows_d, g, cols_z)) * (L(rows_m, cols_d) * psi(rows_d, g, cols_z)); - - }); - - - - } + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + + // phi(rows_m, g, cols_z) = (L(rows_m, + // cols_d) * psi(rows_d, g, cols_z)) * + // (L(rows_m, cols_d) * psi(rows_d, g, + // cols_z)); + }); + } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA column-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout + << " RAJA column-major matrix version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; - 
std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; using RowM = RAJA::expt::RowIndex; using ColD = RAJA::expt::ColIndex; using ColZ = RAJA::expt::ColIndex; - #ifdef RAJA_ENABLE_VECTOR_STATS +#ifdef RAJA_ENABLE_VECTOR_STATS RAJA::expt::tensor_stats::resetVectorStats(); - #endif +#endif RAJA::Timer timer; timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) - { - - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - - }); - - + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + }); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif //----------------------------------------------------------------------------// #if 
VARIANT_RAJA_SEQ_SHMEM -{ - std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; + { + std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - constexpr size_t tile_m = 25; - constexpr size_t tile_d = 80; - constexpr size_t tile_z = 256; - constexpr size_t tile_g = 0; + constexpr size_t tile_m = 25; + constexpr size_t tile_d = 80; + constexpr size_t tile_z = 256; + constexpr size_t tile_g = 0; - using RAJA::statement::Param; + using RAJA::statement::Param; - using EXECPOL = - RAJA::KernelPolicy< + using EXECPOL = RAJA::KernelPolicy< - // Create memory tiles - statement::InitLocalMem, + // Create memory tiles + statement::InitLocalMem< + RAJA::cpu_tile_mem, RAJA::ParamList<0, 1, 2>, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d - // Load L(m,d) for m,d tile into shmem - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d - statement::Lambda<0, Segs<0, 1>, - Params<0>, - Offsets<0, 1>> - > - >, + // Load L(m,d) for m,d tile into shmem + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, - // Run inner g, z loops with z loop tiled - statement::For<2, seq_exec, // g - statement::Tile<3, tile_fixed, seq_exec, // z + // Run inner g, z loops with z loop tiled + statement::For< + 2, seq_exec, // g + statement::Tile<3, tile_fixed, seq_exec, // z - // Load psi into shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<1, Segs<1, 2, 3>, - Params<1>, - Offsets<1, 2, 3>> - > - >, + // Load psi into shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, - // Compute phi - statement::For<0, seq_exec, // m + // Compute phi + statement::For< + 0, seq_exec, // m - // Load 
phi into shmem - statement::For<3, seq_exec, // z - statement::Lambda<2, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - >, + // Load phi into shmem + statement::For< + 3, seq_exec, // z + statement::Lambda< + 2, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>, - // Compute phi in shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<3, Params<0, 1, 2>, - Offsets<0, 1, 2, 3>> - > - >, + // Compute phi in shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 3, Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>>>, - // Store phi - statement:: For<3, seq_exec, // z - statement::Lambda<4, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - > - > // m + // Store phi + statement::For< + 3, seq_exec, // z + statement::Lambda< + 4, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>> // m - > // Tile z - > // g + > // Tile z + > // g - > // Tile d - > // Tile m - > // LocalMemory - >; // KernelPolicy + > // Tile d + > // Tile m + > // LocalMemory + >; // KernelPolicy + // + // Define statically dimensioned local arrays used in kernel + // - // - // Define statically dimensioned local arrays used in kernel - // + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; + using shmem_phi_t = + RAJA::TypedLocalArray, IM, IG, + IZ>; + shmem_phi_t shmem_phi; - using shmem_phi_t = RAJA::TypedLocalArray, - IM, IG, IZ>; - shmem_phi_t shmem_phi; + RAJA::Timer timer; + timer.start(); - RAJA::Timer timer; - timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + RAJA::make_tuple(shmem_L, shmem_psi, shmem_phi), - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). 
- RAJA::make_tuple( shmem_L, - shmem_psi, - shmem_phi), - - - // Lambda<0> : Load L into shmem - [=] (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) - { - sh_L(tm, td) = L(m, d); - }, - // Lambda<1> : Load psi into shmem - [=] (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) - { - sh_psi(td, tg, tz) = psi(d, g, z); - }, + // Lambda<0> : Load L into shmem + [=](IM m, ID d, shmem_L_t& sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, - // Lambda<2> : Load phi into shmem - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) = phi(m, g, z); - }, + // Lambda<1> : Load psi into shmem + [=](ID d, IG g, IZ z, shmem_psi_t& sh_psi, ID td, IG tg, IZ tz) + { sh_psi(td, tg, tz) = psi(d, g, z); }, - // Lambda<3> : Compute phi in shmem - [=] (shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, - IM tm, ID td, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); - }, + // Lambda<2> : Load phi into shmem + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { sh_phi(tm, tg, tz) = phi(m, g, z); }, - // Lambda<4> : Store phi - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - phi(m, g, z) = sh_phi(tm, tg, tz); - } + // Lambda<3> : Compute phi in shmem + [=](shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, IM tm, + ID td, IG tg, IZ tz) + { sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); }, - ); + // Lambda<4> : Store phi + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { phi(m, g, z) = sh_phi(tm, tg, tz); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) && (VARIANT_RAJA_OPENMP) -{ - std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; + { + std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 
1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); #if 1 - using EXECPOL = - RAJA::KernelPolicy< - statement::For<0, omp_parallel_for_exec, // m - statement::For<1, seq_exec, // d - statement::For<2, seq_exec, // g - statement::For<3, simd_exec, // z - statement::Lambda<0> - > - > - > - > - >; + using EXECPOL = RAJA::KernelPolicy>>>>>; #else - // - // Benefits of using OpenMP collapse depends on compiler, platform, - // relative segment sizes. - // - using EXECPOL = - RAJA::KernelPolicy< - statement::Collapse, // m, g, z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - >; + // + // Benefits of using OpenMP collapse depends on compiler, platform, + // relative segment sizes. + // + using EXECPOL = RAJA::KernelPolicy, // m, g, z + statement::For<1, seq_exec, // d + statement::Lambda<0>>>>; #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA OpenMP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA OpenMP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL -{ - std::cout << "\n Running RAJA CUDA version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - - // - // 
View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::For<0, cuda_block_x_loop, // m - statement::For<2, cuda_block_y_loop, // g - statement::For<3, cuda_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA CUDA version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - cudaErrchk( cudaDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * 
psi(d, g, z); }); + cudaErrchk(cudaDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif - -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_TEAMS -{ - std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + 
using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(160, 1, 1), - RAJA::Threads(8, 64, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + for (int iter = 0; iter < num_iter; ++iter) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - - double acc = phi(m, g, z); - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - - acc += L(m, d) * psi(d, g, z); - - - }); - - phi(m,g,z) = acc; + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(160, 1, 1), RAJA::Threads(8, 64, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + double acc = phi(m, g, z); + + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { + acc += L(m, d) * psi(d, g, z); + }); + + phi(m, g, z) = acc; + }); + }); + }); }); - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); + } + cudaErrchk(cudaDeviceSynchronize()); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, 
num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #ifdef __CUDA_ARCH__ #define RAJA_GET_POLICY(POL) typename POL::device_policy_t @@ -1303,732 +1253,711 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if VARIANT_CUDA_TEAMS_MATRIX -{ - std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using matrix_layout = RowMajorLayout; + using matrix_layout = RowMajorLayout; - using L_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using L_matrix_device_t = RAJA::expt::RectMatrixRegister; - using L_matrix_hd_t = RAJA::LaunchPolicy; + using L_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using L_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using L_matrix_hd_t = + RAJA::LaunchPolicy; - using phi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using phi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using phi_matrix_hd_t = RAJA::LaunchPolicy; + using phi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using phi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using phi_matrix_hd_t = + RAJA::LaunchPolicy; - using psi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using psi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using psi_matrix_hd_t = RAJA::LaunchPolicy; + using psi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using psi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using psi_matrix_hd_t = + RAJA::LaunchPolicy; - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + 
using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - auto seg_g = RAJA::TypedRangeSegment(0, num_g); - auto seg_z = RAJA::TypedRangeSegment(0, num_z); - auto seg_m = RAJA::TypedRangeSegment(0, num_m); - auto seg_d = RAJA::TypedRangeSegment(0, num_d); + auto seg_g = RAJA::TypedRangeSegment(0, num_g); + auto seg_z = RAJA::TypedRangeSegment(0, num_z); + auto seg_m = RAJA::TypedRangeSegment(0, num_m); + auto seg_d = RAJA::TypedRangeSegment(0, num_d); - printf("num_iter=%d\n", (int)num_iter); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), - RAJA::Threads(32, 32, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + printf("num_iter=%d\n", (int)num_iter); + for (int iter = 0; iter < num_iter; ++iter) { + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), + RAJA::Threads(32, 32, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); + using L_RowM = RAJA::expt::RowIndex; + using L_ColD = RAJA::expt::ColIndex; + + using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); + using psi_RowD = RAJA::expt::RowIndex; + using psi_ColZ = RAJA::expt::ColIndex; + + using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); + using phi_RowM = RAJA::expt::RowIndex; + using phi_ColZ = RAJA::expt::ColIndex; + + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::tile( + ctx, 32, + RAJA::TypedRangeSegment(0, num_z), + [&](RAJA::TypedRangeSegment tzi) + { + RAJA::TypedRangeSegment tz( + *tzi.begin(), *tzi.end()); + + phi(phi_RowM::all(), g, phi_ColZ(tz)) += + L(L_RowM::all(), L_ColD::all()) * + psi(psi_RowD::all(), g, psi_ColZ(tz)); + }); + }); + }); + } + cudaErrchk(cudaDeviceSynchronize()); - - using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); - using L_RowM = RAJA::expt::RowIndex; - using L_ColD = RAJA::expt::ColIndex; - - using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); - using psi_RowD = RAJA::expt::RowIndex; - using psi_ColZ = RAJA::expt::ColIndex; - - using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); - using phi_RowM = RAJA::expt::RowIndex; - using phi_ColZ = RAJA::expt::ColIndex; - - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - - RAJA::tile(ctx, 32, RAJA::TypedRangeSegment(0, 
num_z), [&](RAJA::TypedRangeSegment tzi){ - - RAJA::TypedRangeSegment tz(*tzi.begin(), *tzi.end()); - - phi(phi_RowM::all(), g, phi_ColZ(tz)) += - L(L_RowM::all(), L_ColD::all()) * psi(psi_RowD::all(), g, psi_ColZ(tz)); - - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); - - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + { + std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 
dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, cuda_thread_x_loop, // d + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::CudaSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, cuda_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + cuda_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, cuda_thread_x_direct, // z + statement::For<1, cuda_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::CudaSyncThreads, + + // Compute phi + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile over directions + statement::CudaSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::CudaSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // CudaKernelAsync + + >; // KernelPolicy - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - - - // - // Define statically dimensioned local arrays used in kernel - // 
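Each variant above builds its Views from RAJA::make_permuted_layout, where the last entry of the permutation array names the stride-1 dimension (hence the "z is stride-1 dimension" comments). A minimal host-only sketch of that pattern follows; it is independent of this patch, the function and variable names are illustrative, and it assumes only that the RAJA headers are available.

#include <array>
#include <vector>
#include "RAJA/RAJA.hpp"

void permuted_view_sketch()
{
  const int rows = 2, cols = 3;
  std::vector<double> data(rows * cols, 0.0);

  // perm {{0, 1}} leaves the right-most index (cols) stride-1;
  // perm {{1, 0}} would instead make the row index stride-1.
  std::array<RAJA::idx_t, 2> perm {{0, 1}};
  auto layout = RAJA::make_permuted_layout({{rows, cols}}, perm);
  RAJA::View<double, decltype(layout)> v(data.data(), layout);

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      v(i, j) = i * cols + j;   // consecutive j values land in consecutive memory slots
    }
  }
}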
- - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, cuda_thread_x_loop, // d - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::CudaSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, cuda_block_y_loop, // g - statement::Tile<3, tile_fixed, cuda_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3,cuda_thread_x_direct, // z - statement::For<1, cuda_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::CudaSyncThreads, - - // Compute phi - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::CudaSyncThreads, - - // Write out phi from thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::CudaSyncThreads + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - > // Tile z - > // g + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), - > // Tile d - > // Tile m - > // init shmem - > // CudaKernelAsync - - >; // KernelPolicy + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
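// Illustrative note (not from the original source), making the comment above
// concrete: with the parameter tuple (shmem_L, shmem_psi, 0.0), a statement
// such as
//   Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>>
// invokes lambda 0 with the selected segment indices first, then the selected
// parameters, then the tile-local offsets, i.e.
//   (IM m, ID d, shmem_L_t& sh_L, IM tm, ID td)
// which is exactly the signature of Lambda<0> below.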
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z, shmem_L_t & sh_L, + shmem_psi_t & sh_psi, double& phi_local, IM tm, ID td, + IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. - RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } - cudaDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + cudaDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); -} + cudaErrchk(cudaFree(dL_data)); + 
cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL -{ - std::cout << "\n Running RAJA HIP version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::For<0, hip_block_x_loop, // m - statement::For<2, hip_block_y_loop, // g - statement::For<3, hip_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA HIP version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, 
num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - hipErrchk( hipDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, 
psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - + { + std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + using RAJA::statement::Param; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, hip_thread_x_loop, // d + statement::For<0, hip_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::HipSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, hip_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + hip_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, hip_thread_x_direct, // z + statement::For<1, hip_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::HipSyncThreads, + + // Compute phi + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile 
over directions + statement::HipSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::HipSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // HipKernelAsync + + >; // KernelPolicy - // - // Define statically dimensioned local arrays used in kernel - // - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::statement::Param; - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, hip_thread_x_loop, // d - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::HipSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, hip_block_y_loop, // g - statement::Tile<3, tile_fixed, hip_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3, hip_thread_x_direct, // z - statement::For<1, hip_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::HipSyncThreads, - - // Compute phi - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::HipSyncThreads, - - // Write out phi from thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::HipSyncThreads - - > // Tile z - > // g - - > // Tile d - > // Tile m - > // init shmem - > // HipKernelAsync - - >; // KernelPolicy - - - - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. 
- RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), - IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); - - hipDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), + IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), + shmem_L_t & sh_L, shmem_psi_t & sh_psi, + double& phi_local, IM tm, ID td, IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, + + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } + + ); + + hipDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); -} + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -2039,36 +1968,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Function to check result and report P/F. 
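Every variant above brackets its kernel launches with a device synchronization before starting and before stopping the timer, because the kernels are launched asynchronously; the reported rate is then simply total_flops / seconds / 1e9. A condensed sketch of that pattern is shown below; the helper name is hypothetical and it assumes the CUDA runtime and RAJA headers are available.

#include <cuda_runtime.h>
#include "RAJA/RAJA.hpp"

double time_and_report(double total_flops)
{
  RAJA::Timer timer;
  cudaDeviceSynchronize();            // drain any previously queued async work
  timer.start();

  // ... asynchronously launched kernels (e.g. CudaKernelAsync) would run here ...

  cudaDeviceSynchronize();            // wait for the kernels before stopping the clock
  timer.stop();

  double seconds = timer.elapsed();
  return total_flops / seconds / 1.0e9;   // GFLOP/s, as reported above
}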
// template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z) { - size_t nerrors = 0; + size_t nerrors = 0; double total_error = 0.0; - for (IM m(0); m < num_m; ++m) { - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { + for (IM m(0); m < num_m; ++m) + { + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { double total = 0.0; - for (ID d(0); d < num_d; ++d) { + for (ID d(0); d < num_d; ++d) + { double val = L(m, d) * psi(d, g, z); total += val; } - if (std::abs(total-phi(m, g, z)) > 1e-9) { - printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", - (int)*g, (int)*z, (int)*m, phi(m,g,z), total); + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", (int)*g, + (int)*z, (int)*m, phi(m, g, z), total); ++nerrors; } - total_error += std::abs(total-phi(m, g, z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if ( nerrors == 0 ) { + if (nerrors == 0) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL : " << nerrors << " errors!\n"; } } diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp index 331d6c51dd..3ed7dcd11d 100644 --- a/benchmark/raja_view_blur.cpp +++ b/benchmark/raja_view_blur.cpp @@ -15,117 +15,107 @@ * */ -//Uncomment to specify variant -//#define RUN_HIP_VARIANT -//#define RUN_CUDA_VARIANT -//#define RUN_SYCL_VARIANT -//#define RUN_OPENMP_VARIANT +// Uncomment to specify variant +// #define RUN_HIP_VARIANT +// #define RUN_CUDA_VARIANT +// #define RUN_SYCL_VARIANT +// #define RUN_OPENMP_VARIANT #define RUN_SEQ_VARIANT -using host_pol = RAJA::seq_exec; +using host_pol = RAJA::seq_exec; using host_resources = RAJA::resources::Host; #if defined(RAJA_ENABLE_HIP) && defined(RUN_HIP_VARIANT) -using device_pol = RAJA::hip_exec<256>; +using device_pol = RAJA::hip_exec<256>; using device_resources = RAJA::resource::Hip; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed<256, - RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::hip_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_CUDA) && defined(RUN_CUDA_VARIANT) -using device_pol = RAJA::cuda_exec<256>; +using device_pol = RAJA::cuda_exec<256>; using device_resources = RAJA::resources::Cuda; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed<256, - RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::cuda_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_SYCL) && defined(RUN_SYCL_VARIANT) -using device_pol = RAJA::sycl_exec<256>; +using device_pol = RAJA::sycl_exec<256>; using device_resources = RAJA::resources::Sycl; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::SyclKernel< - RAJA::statement::For<1, RAJA::sycl_global_item_1, - RAJA::statement::For<0, RAJA::sycl_global_item_2, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = + RAJA::KernelPolicy>>>>; #endif #if defined(RAJA_ENABLE_OPENMP) 
&& defined(RUN_OPENMP_VARIANT) -using device_pol = RAJA::omp_parallel_for_exec; +using device_pol = RAJA::omp_parallel_for_exec; using device_resources = RAJA::resources::Host; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using kernel_pol = RAJA::KernelPolicy>>>; #endif #if defined(RUN_SEQ_VARIANT) -using device_pol = RAJA::seq_exec; -using device_resources = RAJA::resources::Host; - -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using device_pol = RAJA::seq_exec; +using device_resources = RAJA::resources::Host; + +using kernel_pol = RAJA::KernelPolicy>>>; #endif -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { const int N = 10000; const int K = 17; - device_resources def_device_res{device_resources::get_default()}; - host_resources def_host_res{host_resources::get_default()}; + device_resources def_device_res {device_resources::get_default()}; + host_resources def_host_res {host_resources::get_default()}; auto timer = RAJA::Timer(); - //launch to intialize the stream - RAJA::forall - (RAJA::RangeSegment(0,1), [=] RAJA_HOST_DEVICE (int i) { - }); + // launch to intialize the stream + RAJA::forall(RAJA::RangeSegment(0, 1), + [=] RAJA_HOST_DEVICE(int i) {}); - int * array = def_host_res.allocate(N * N); - int * array_copy = def_host_res.allocate(N * N); + int* array = def_host_res.allocate(N * N); + int* array_copy = def_host_res.allocate(N * N); - //big array, or image - for (int i = 0; i < N * N; ++i) { - array[i] = 1; + // big array, or image + for (int i = 0; i < N * N; ++i) + { + array[i] = 1; array_copy[i] = 1; } - //small array that acts as the blur - int * kernel = def_host_res.allocate(K * K); - for (int i = 0; i < K * K; ++i) { + // small array that acts as the blur + int* kernel = def_host_res.allocate(K * K); + for (int i = 0; i < K * K; ++i) + { kernel[i] = 2; } @@ -140,7 +130,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int DIM = 2; RAJA::View> array_view(d_array, N, N); - RAJA::View> array_view_copy(d_array_copy, N, N); + RAJA::View> array_view_copy(d_array_copy, N, + N); RAJA::View> kernel_view(d_kernel, K, K); RAJA::RangeSegment range_i(0, N); @@ -148,60 +139,68 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); - RAJA::kernel - (RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - //looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += kernel_view(m, n) * array_view(x, y); - } - } - } - - array_view(i, j) += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += kernel_view(m, n) * array_view(x, y); + } + } + } + + array_view(i, j) += sum; + }); timer.stop(); - std::cout<<"Elapsed time with RAJA view : "< - 
(RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - // looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += d_kernel[m * K + n] * d_array_copy[x * N + y]; - } - } - } + timer.reset(); + timer.start(); - d_array_copy[i * N + j] += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += d_kernel[m * K + n] * + d_array_copy[x * N + y]; + } + } + } + + d_array_copy[i * N + j] += sum; + }); timer.stop(); - std::cout<<"Elapsed time with NO RAJA view : "<
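For reference, the stencil that both timed loops in this benchmark compute (with and without RAJA Views) is a K-by-K weighted accumulation clamped at the upper image edge, applied in place. A small plain-loop sketch with illustrative sizes, not part of the benchmark itself:

#include <vector>

// Small stand-ins for the benchmark's N = 10000 image and K = 17 kernel.
void blur_sketch()
{
  const int N = 8, K = 3;
  std::vector<int> image(N * N, 1);
  std::vector<int> kernel(K * K, 2);

  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      int sum = 0;
      for (int m = 0; m < K; ++m) {
        for (int n = 0; n < K; ++n) {
          int x = i + m;
          int y = j + n;
          if (x < N && y < N) {               // clamp at the upper image edge
            sum += kernel[m * K + n] * image[x * N + y];
          }
        }
      }
      image[i * N + j] += sum;                // accumulate back into the image
    }
  }
}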