From 70186a9b21e32bec04a2236b21c24249300602e2 Mon Sep 17 00:00:00 2001
From: john bowen
Date: Thu, 5 Sep 2024 09:38:36 -0700
Subject: [PATCH 1/9] Add CMake configuration for style target

---
 .clang-format          | 34 +++++++++++++++++----
 CMakeLists.txt         |  5 +++-
 cmake/RAJAMacros.cmake | 67 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/.clang-format b/.clang-format
index 1d2ad9a77f..47b6b0bee6 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,15 +1,41 @@
-BasedOnStyle : google
+BasedOnStyle : LLVM
+# Indent formatting
 IndentWidth : 2
+UseTab: Never
 BreakBeforeBraces : Linux
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
-UseTab: Never
+
+# Control curly brace placement
+BraceWrapping:
+  AfterCaseLabel: true
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: false
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  # BeforeLambdaBody: true # available in clang 11
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+
+# Pointer alignment
+DerivePointerAlignment: false
+PointerAlignment: Left
+SortIncludes: false
 AllowShortIfStatementsOnASingleLine : true
 ConstructorInitializerAllOnOneLineOrOnePerLine : true
 AllowShortFunctionsOnASingleLine : true
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : false
+BinPackParameters : true
 AllowAllParametersOfDeclarationOnNextLine : false
 AlignTrailingComments : true
 ColumnLimit : 80
@@ -17,11 +43,9 @@ PenaltyBreakBeforeFirstCallParameter : 100
 PenaltyReturnTypeOnItsOwnLine : 65000
 PenaltyBreakString : 10
 
-# These improve formatting results but require clang 3.6/7 or higher
 BreakBeforeBinaryOperators : None
 AlignAfterOpenBracket: true
 BinPackArguments : false
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-Cpp11BracedListStyle : true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b31cbe124..dbe5b3f113 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
 
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..5233850919 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -204,3 +204,70 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+##
+## This creates the following parent build targets:
+##   check - Runs a non file changing style check and CppCheck
+##   style - In-place code formatting
+##
+## Creates various child build targets that follow this pattern:
+##         raja_
+##         raja__
+##------------------------------------------------------------------------------
+macro(raja_add_code_checks)
+
+  set(options)
+  set(singleValueArgs)
+  set(multiValueArgs)
+
+  # Parse the arguments to the macro
+  cmake_parse_arguments(arg
+      "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Only do code checks if building raja by itself and not included in
+  # another project
+  if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+    # Create file globbing expressions that only include directories that contain source
+    set(_base_dirs "RAJA" "examples" "exercises" "benchmark" "include" "src" "test")
+    set(_ext_expressions "*.cpp" "*.hpp" "*.inl"
+                         "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh")
+
+    set(_glob_expressions)
+    foreach(_exp ${_ext_expressions})
+      foreach(_base_dir ${_base_dirs})
+        list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}")
+      endforeach()
+    endforeach()
+
+    # Glob for list of files to run code checks on
+    set(_sources)
+    file(GLOB_RECURSE _sources ${_glob_expressions})
+
+    # Filter out exclusions
+    #set(_exclude_expressions
+    #  "${PROJECT_SOURCE_DIR}/axom/sidre/examples/lulesh2/*"
+    #  "${PROJECT_SOURCE_DIR}/axom/slam/examples/lulesh2.0.3/*"
+    #  "${PROJECT_SOURCE_DIR}/axom/slam/examples/tinyHydro/*")
+    #foreach(_exp ${_exclude_expressions})
+    #  list(FILTER _sources EXCLUDE REGEX ${_exp})
+    #endforeach()
+#
+    blt_add_code_checks(PREFIX RAJA
+                        SOURCES ${_sources}
+                        CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format
+                        CPPCHECK_FLAGS --enable=all --inconclusive)
+
+    # Set FOLDER property for code check targets
+    foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style)
+      set(_tgt ${arg_PREFIX}_${_suffix})
+      if(TARGET ${_tgt})
+        set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks")
+      endif()
+    endforeach()
+  endif()
+
+endmacro(raja_add_code_checks)

From 696caf4f122e412bec0eff74a2c770f8c9bcbc50 Mon Sep 17 00:00:00 2001
From: john bowen
Date: Thu, 5 Sep 2024 10:13:33 -0700
Subject: [PATCH 2/9] Refactor RAJA using the make style target

---
 .clang-format | 17 +-
 examples/dynamic-forall.cpp | 93 +-
 examples/dynamic_mat_transpose.cpp | 311 +-
 examples/forall-param-reductions.cpp | 389 +-
 examples/forall_multi-reductions.cpp | 114 +-
 examples/jacobi.cpp | 317 +-
 examples/kernel-dynamic-tile.cpp | 50 +-
 examples/launch-param-reductions.cpp | 338 +-
 examples/launch_flatten.cpp | 107 +-
 examples/launch_matrix-multiply.cpp | 765 ++--
 examples/launch_reductions.cpp | 159 +-
 examples/memoryManager.hpp | 57 +-
 examples/multiview.cpp | 168 +-
 examples/omp-target-kernel.cpp | 32 +-
 examples/omp-target-ltimes.cpp | 135 +-
 examples/pi-reduce_vs_atomic.cpp | 130 +-
 examples/plugin/counter-plugin.cpp | 46 +-
 examples/plugin/test-plugin-dynamic.cpp | 9 +-
 examples/plugin/test-plugin.cpp | 10 +-
 examples/plugin/timer-plugin.cpp | 20 +-
 examples/raja-launch.cpp | 96 +-
 examples/red-black-gauss-seidel.cpp | 95 +-
 examples/resource-dynamic-forall.cpp | 117 +-
 examples/resource-forall.cpp | 380 +-
 examples/resource-kernel.cpp | 54 +-
 examples/resource-launch.cpp | 54 +-
 examples/resource-runtime-launch.cpp | 159 +-
 examples/tut_daxpy.cpp | 225 +-
 examples/tut_halo-exchange.cpp | 2172 +++++++-----
 examples/tut_launch_basic.cpp | 220 +-
 examples/tut_matrix-multiply.cpp |
1438 ++++---- examples/wave-eqn.cpp | 190 +- exercises/atomic-histogram.cpp | 151 +- exercises/atomic-histogram_solution.cpp | 193 +- exercises/dot-product.cpp | 101 +- exercises/dot-product_solution.cpp | 95 +- .../kernel-matrix-transpose-local-array.cpp | 477 +-- ...-matrix-transpose-local-array_solution.cpp | 726 ++-- exercises/kernel-matrix-transpose-tiled.cpp | 214 +- ...kernel-matrix-transpose-tiled_solution.cpp | 282 +- exercises/kernel-matrix-transpose.cpp | 85 +- .../kernel-matrix-transpose_solution.cpp | 133 +- exercises/kernelintro-execpols.cpp | 461 ++- exercises/kernelintro-execpols_solution.cpp | 512 ++- exercises/kernelintro-nested-loop-reorder.cpp | 159 +- ...rnelintro-nested-loop-reorder_solution.cpp | 209 +- .../launch-matrix-transpose-local-array.cpp | 288 +- ...-matrix-transpose-local-array_solution.cpp | 328 +- exercises/launch-matrix-transpose-tiled.cpp | 230 +- ...launch-matrix-transpose-tiled_solution.cpp | 235 +- exercises/launch-matrix-transpose.cpp | 138 +- .../launch-matrix-transpose_solution.cpp | 113 +- exercises/launchintro-execpols.cpp | 410 +-- exercises/launchintro-execpols_solution.cpp | 412 +-- exercises/memoryManager.hpp | 57 +- exercises/offset-layout-stencil.cpp | 263 +- exercises/offset-layout-stencil_solution.cpp | 295 +- .../permuted-layout-batch-matrix-multiply.cpp | 666 ++-- ...-layout-batch-matrix-multiply_solution.cpp | 749 ++-- exercises/reductions.cpp | 126 +- exercises/reductions_solution.cpp | 148 +- exercises/scan.cpp | 130 +- exercises/scan_solution.cpp | 125 +- exercises/segment-indexset-basics.cpp | 157 +- .../segment-indexset-basics_solution.cpp | 180 +- exercises/sort.cpp | 395 ++- exercises/sort_solution.cpp | 384 +- exercises/tutorial_halfday/ex2_approx-pi.cpp | 101 +- .../ex2_approx-pi_solution.cpp | 111 +- .../tutorial_halfday/ex5_line-of-sight.cpp | 133 +- .../ex5_line-of-sight_solution.cpp | 146 +- .../ex6_stencil-offset-layout.cpp | 257 +- .../ex6_stencil-offset-layout_solution.cpp | 248 +- .../ex8_tiled-matrix-transpose.cpp | 120 +- .../ex8_tiled-matrix-transpose_solution.cpp | 229 +- .../ex9_matrix-transpose-local-array.cpp | 88 +- ..._matrix-transpose-local-array_solution.cpp | 321 +- exercises/tutorial_halfday/memoryManager.hpp | 44 +- exercises/vector-addition.cpp | 204 +- exercises/vector-addition_solution.cpp | 230 +- exercises/vertexsum-indexset.cpp | 432 +-- exercises/vertexsum-indexset_solution.cpp | 448 +-- exercises/view-layout.cpp | 481 +-- exercises/view-layout_solution.cpp | 488 +-- include/RAJA/RAJA.hpp | 12 +- include/RAJA/index/IndexSet.hpp | 289 +- include/RAJA/index/IndexSetBuilders.hpp | 37 +- include/RAJA/index/IndexSetUtils.hpp | 43 +- include/RAJA/index/IndexValue.hpp | 144 +- include/RAJA/index/ListSegment.hpp | 75 +- include/RAJA/index/RangeSegment.hpp | 153 +- include/RAJA/internal/DepGraphNode.hpp | 13 +- include/RAJA/internal/Iterators.hpp | 129 +- include/RAJA/internal/MemUtils_CPU.hpp | 24 +- include/RAJA/internal/RAJAVec.hpp | 196 +- include/RAJA/internal/ThreadUtils_CPU.hpp | 4 +- include/RAJA/internal/fault_tolerance.hpp | 114 +- include/RAJA/internal/foldl.hpp | 47 +- include/RAJA/internal/get_platform.hpp | 50 +- include/RAJA/pattern/WorkGroup.hpp | 227 +- include/RAJA/pattern/WorkGroup/Dispatcher.hpp | 472 ++- include/RAJA/pattern/WorkGroup/WorkRunner.hpp | 176 +- .../RAJA/pattern/WorkGroup/WorkStorage.hpp | 552 +-- include/RAJA/pattern/WorkGroup/WorkStruct.hpp | 62 +- include/RAJA/pattern/atomic.hpp | 63 +- include/RAJA/pattern/detail/algorithm.hpp | 27 +- 
include/RAJA/pattern/detail/forall.hpp | 12 +- include/RAJA/pattern/detail/multi_reduce.hpp | 177 +- include/RAJA/pattern/detail/privatizer.hpp | 15 +- include/RAJA/pattern/detail/reduce.hpp | 248 +- include/RAJA/pattern/forall.hpp | 501 +-- include/RAJA/pattern/kernel.hpp | 99 +- include/RAJA/pattern/kernel/Collapse.hpp | 8 +- include/RAJA/pattern/kernel/Conditional.hpp | 68 +- include/RAJA/pattern/kernel/For.hpp | 43 +- include/RAJA/pattern/kernel/ForICount.hpp | 35 +- include/RAJA/pattern/kernel/Hyperplane.hpp | 43 +- include/RAJA/pattern/kernel/InitLocalMem.hpp | 68 +- include/RAJA/pattern/kernel/Lambda.hpp | 188 +- include/RAJA/pattern/kernel/Param.hpp | 15 +- include/RAJA/pattern/kernel/Reduce.hpp | 10 +- include/RAJA/pattern/kernel/Region.hpp | 35 +- include/RAJA/pattern/kernel/Tile.hpp | 110 +- include/RAJA/pattern/kernel/TileTCount.hpp | 40 +- .../RAJA/pattern/kernel/internal/LoopData.hpp | 123 +- .../pattern/kernel/internal/LoopTypes.hpp | 78 +- .../pattern/kernel/internal/Statement.hpp | 17 +- .../pattern/kernel/internal/StatementList.hpp | 31 +- .../RAJA/pattern/kernel/internal/Template.hpp | 19 +- include/RAJA/pattern/launch/launch_core.hpp | 591 ++-- include/RAJA/pattern/multi_reduce.hpp | 9 +- include/RAJA/pattern/params/forall.hpp | 726 ++-- include/RAJA/pattern/params/kernel_name.hpp | 17 +- include/RAJA/pattern/params/params_base.hpp | 21 +- include/RAJA/pattern/params/reducer.hpp | 128 +- include/RAJA/pattern/reduce.hpp | 6 +- include/RAJA/pattern/region.hpp | 4 +- include/RAJA/pattern/scan.hpp | 268 +- include/RAJA/pattern/sort.hpp | 214 +- include/RAJA/pattern/synchronize.hpp | 4 +- .../RAJA/pattern/tensor/MatrixRegister.hpp | 36 +- .../RAJA/pattern/tensor/ScalarRegister.hpp | 10 +- include/RAJA/pattern/tensor/TensorBlock.hpp | 1 - include/RAJA/pattern/tensor/TensorIndex.hpp | 361 +- include/RAJA/pattern/tensor/TensorLayout.hpp | 83 +- .../RAJA/pattern/tensor/TensorRegister.hpp | 131 +- .../RAJA/pattern/tensor/VectorRegister.hpp | 13 +- .../tensor/internal/ET/BinaryOperator.hpp | 233 +- .../internal/ET/BinaryOperatorTraits.hpp | 226 +- .../tensor/internal/ET/BlockLiteral.hpp | 144 +- .../internal/ET/ExpressionTemplateBase.hpp | 231 +- .../tensor/internal/ET/MultiplyOperator.hpp | 2223 ++++++------ .../tensor/internal/ET/TensorDivide.hpp | 681 ++-- .../tensor/internal/ET/TensorLiteral.hpp | 130 +- .../tensor/internal/ET/TensorLoadStore.hpp | 361 +- .../tensor/internal/ET/TensorMultiply.hpp | 248 +- .../tensor/internal/ET/TensorMultiplyAdd.hpp | 145 +- .../tensor/internal/ET/TensorNegate.hpp | 103 +- .../internal/ET/TensorScalarLiteral.hpp | 121 +- .../tensor/internal/ET/TensorTranspose.hpp | 114 +- .../tensor/internal/ET/normalizeOperand.hpp | 99 +- .../tensor/internal/ExpressionTemplate.hpp | 1 - .../tensor/internal/MatrixMatrixMultiply.hpp | 523 +-- .../tensor/internal/MatrixRegisterImpl.hpp | 2686 +++++++------- .../pattern/tensor/internal/RegisterBase.hpp | 2023 +++++------ .../tensor/internal/TensorIndexTraits.hpp | 582 ++- .../pattern/tensor/internal/TensorRef.hpp | 1256 ++++--- .../tensor/internal/TensorRegisterBase.hpp | 1539 ++++---- .../tensor/internal/TensorTileExec.hpp | 531 +-- .../tensor/internal/VectorRegisterImpl.hpp | 1768 +++++----- include/RAJA/pattern/tensor/stats.hpp | 3 +- include/RAJA/policy/MultiPolicy.hpp | 70 +- include/RAJA/policy/PolicyBase.hpp | 148 +- include/RAJA/policy/WorkGroup.hpp | 87 +- include/RAJA/policy/atomic_auto.hpp | 54 +- include/RAJA/policy/atomic_builtin.hpp | 352 +- include/RAJA/policy/cuda.hpp | 6 +- 
include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 253 +- include/RAJA/policy/cuda/WorkGroup.hpp | 2 +- .../RAJA/policy/cuda/WorkGroup/Dispatcher.hpp | 47 +- .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 275 +- include/RAJA/policy/cuda/atomic.hpp | 459 +-- include/RAJA/policy/cuda/forall.hpp | 717 ++-- include/RAJA/policy/cuda/intrinsics.hpp | 157 +- include/RAJA/policy/cuda/kernel.hpp | 2 +- .../RAJA/policy/cuda/kernel/Conditional.hpp | 20 +- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 249 +- include/RAJA/policy/cuda/kernel/For.hpp | 216 +- include/RAJA/policy/cuda/kernel/ForICount.hpp | 318 +- .../RAJA/policy/cuda/kernel/Hyperplane.hpp | 42 +- .../RAJA/policy/cuda/kernel/InitLocalMem.hpp | 164 +- include/RAJA/policy/cuda/kernel/Lambda.hpp | 34 +- include/RAJA/policy/cuda/kernel/Reduce.hpp | 40 +- include/RAJA/policy/cuda/kernel/Sync.hpp | 53 +- include/RAJA/policy/cuda/kernel/Tile.hpp | 151 +- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 198 +- include/RAJA/policy/cuda/kernel/internal.hpp | 515 +-- include/RAJA/policy/cuda/launch.hpp | 991 ++++-- include/RAJA/policy/cuda/multi_reduce.hpp | 552 +-- .../RAJA/policy/cuda/params/kernel_name.hpp | 56 +- include/RAJA/policy/cuda/params/reduce.hpp | 83 +- include/RAJA/policy/cuda/policy.hpp | 3126 ++++++++++------- include/RAJA/policy/cuda/raja_cudaerrchk.hpp | 35 +- include/RAJA/policy/cuda/reduce.hpp | 557 +-- include/RAJA/policy/cuda/scan.hpp | 108 +- include/RAJA/policy/cuda/sort.hpp | 555 +-- include/RAJA/policy/cuda/synchronize.hpp | 10 +- include/RAJA/policy/desul.hpp | 2 +- include/RAJA/policy/desul/atomic.hpp | 141 +- include/RAJA/policy/hip.hpp | 4 +- include/RAJA/policy/hip/MemUtils_HIP.hpp | 279 +- include/RAJA/policy/hip/WorkGroup.hpp | 2 +- .../RAJA/policy/hip/WorkGroup/Dispatcher.hpp | 40 +- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 303 +- include/RAJA/policy/hip/atomic.hpp | 455 +-- include/RAJA/policy/hip/forall.hpp | 700 ++-- include/RAJA/policy/hip/intrinsics.hpp | 148 +- include/RAJA/policy/hip/kernel.hpp | 2 +- .../RAJA/policy/hip/kernel/Conditional.hpp | 22 +- include/RAJA/policy/hip/kernel/For.hpp | 221 +- include/RAJA/policy/hip/kernel/ForICount.hpp | 320 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 228 +- include/RAJA/policy/hip/kernel/Hyperplane.hpp | 42 +- .../RAJA/policy/hip/kernel/InitLocalMem.hpp | 166 +- include/RAJA/policy/hip/kernel/Lambda.hpp | 34 +- include/RAJA/policy/hip/kernel/Reduce.hpp | 59 +- include/RAJA/policy/hip/kernel/Sync.hpp | 49 +- include/RAJA/policy/hip/kernel/Tile.hpp | 151 +- include/RAJA/policy/hip/kernel/TileTCount.hpp | 192 +- include/RAJA/policy/hip/kernel/internal.hpp | 515 +-- include/RAJA/policy/hip/launch.hpp | 968 +++-- include/RAJA/policy/hip/multi_reduce.hpp | 551 +-- .../RAJA/policy/hip/params/kernel_name.hpp | 55 +- include/RAJA/policy/hip/params/reduce.hpp | 82 +- include/RAJA/policy/hip/policy.hpp | 2885 +++++++++------ include/RAJA/policy/hip/raja_hiperrchk.hpp | 32 +- include/RAJA/policy/hip/reduce.hpp | 551 +-- include/RAJA/policy/hip/scan.hpp | 110 +- include/RAJA/policy/hip/sort.hpp | 465 ++- include/RAJA/policy/hip/synchronize.hpp | 10 +- include/RAJA/policy/openmp.hpp | 6 +- include/RAJA/policy/openmp/WorkGroup.hpp | 2 +- .../policy/openmp/WorkGroup/Dispatcher.hpp | 12 +- .../policy/openmp/WorkGroup/WorkRunner.hpp | 70 +- include/RAJA/policy/openmp/atomic.hpp | 77 +- include/RAJA/policy/openmp/forall.hpp | 435 +-- include/RAJA/policy/openmp/kernel.hpp | 2 +- .../RAJA/policy/openmp/kernel/Collapse.hpp | 53 +- .../policy/openmp/kernel/OmpSyncThreads.hpp | 33 +- 
include/RAJA/policy/openmp/launch.hpp | 385 +- include/RAJA/policy/openmp/multi_reduce.hpp | 279 +- include/RAJA/policy/openmp/params/forall.hpp | 562 +-- .../RAJA/policy/openmp/params/kernel_name.hpp | 50 +- include/RAJA/policy/openmp/params/reduce.hpp | 52 +- include/RAJA/policy/openmp/policy.hpp | 237 +- include/RAJA/policy/openmp/reduce.hpp | 19 +- include/RAJA/policy/openmp/region.hpp | 20 +- include/RAJA/policy/openmp/scan.hpp | 121 +- include/RAJA/policy/openmp/sort.hpp | 147 +- include/RAJA/policy/openmp/synchronize.hpp | 8 +- include/RAJA/policy/openmp_target.hpp | 5 +- .../RAJA/policy/openmp_target/WorkGroup.hpp | 2 +- .../openmp_target/WorkGroup/Dispatcher.hpp | 30 +- .../openmp_target/WorkGroup/WorkRunner.hpp | 70 +- include/RAJA/policy/openmp_target/forall.hpp | 115 +- include/RAJA/policy/openmp_target/kernel.hpp | 2 +- .../policy/openmp_target/kernel/Collapse.hpp | 96 +- .../RAJA/policy/openmp_target/kernel/For.hpp | 44 +- .../openmp_target/params/kernel_name.hpp | 50 +- .../policy/openmp_target/params/reduce.hpp | 52 +- include/RAJA/policy/openmp_target/policy.hpp | 76 +- include/RAJA/policy/openmp_target/reduce.hpp | 190 +- include/RAJA/policy/sequential.hpp | 4 +- include/RAJA/policy/sequential/WorkGroup.hpp | 2 +- .../sequential/WorkGroup/Dispatcher.hpp | 14 +- .../sequential/WorkGroup/WorkRunner.hpp | 68 +- include/RAJA/policy/sequential/atomic.hpp | 49 +- include/RAJA/policy/sequential/forall.hpp | 56 +- include/RAJA/policy/sequential/kernel.hpp | 2 +- .../policy/sequential/kernel/Collapse.hpp | 30 +- .../RAJA/policy/sequential/kernel/Reduce.hpp | 10 +- include/RAJA/policy/sequential/launch.hpp | 195 +- .../RAJA/policy/sequential/multi_reduce.hpp | 88 +- .../policy/sequential/params/kernel_name.hpp | 54 +- .../RAJA/policy/sequential/params/reduce.hpp | 48 +- include/RAJA/policy/sequential/policy.hpp | 64 +- include/RAJA/policy/sequential/reduce.hpp | 6 +- include/RAJA/policy/sequential/region.hpp | 10 +- include/RAJA/policy/sequential/scan.hpp | 90 +- include/RAJA/policy/sequential/sort.hpp | 83 +- include/RAJA/policy/simd.hpp | 2 +- include/RAJA/policy/simd/forall.hpp | 46 +- include/RAJA/policy/simd/kernel/For.hpp | 34 +- include/RAJA/policy/simd/kernel/ForICount.hpp | 26 +- include/RAJA/policy/simd/launch.hpp | 30 +- include/RAJA/policy/simd/policy.hpp | 10 +- include/RAJA/policy/sycl.hpp | 4 +- include/RAJA/policy/sycl/MemUtils_SYCL.hpp | 23 +- include/RAJA/policy/sycl/forall.hpp | 264 +- include/RAJA/policy/sycl/kernel.hpp | 2 +- .../RAJA/policy/sycl/kernel/Conditional.hpp | 21 +- include/RAJA/policy/sycl/kernel/For.hpp | 199 +- include/RAJA/policy/sycl/kernel/ForICount.hpp | 256 +- include/RAJA/policy/sycl/kernel/Lambda.hpp | 34 +- .../RAJA/policy/sycl/kernel/SyclKernel.hpp | 106 +- include/RAJA/policy/sycl/kernel/Tile.hpp | 192 +- .../RAJA/policy/sycl/kernel/TileTCount.hpp | 241 +- include/RAJA/policy/sycl/kernel/internal.hpp | 151 +- include/RAJA/policy/sycl/launch.hpp | 994 +++--- .../RAJA/policy/sycl/params/kernel_name.hpp | 61 +- include/RAJA/policy/sycl/params/reduce.hpp | 53 +- include/RAJA/policy/sycl/policy.hpp | 155 +- include/RAJA/policy/sycl/reduce.hpp | 300 +- include/RAJA/policy/tensor.hpp | 2 +- include/RAJA/policy/tensor/arch.hpp | 54 +- include/RAJA/policy/tensor/arch/avx.hpp | 10 +- .../policy/tensor/arch/avx/avx_double.hpp | 889 ++--- .../RAJA/policy/tensor/arch/avx/avx_float.hpp | 928 ++--- .../RAJA/policy/tensor/arch/avx/avx_int32.hpp | 1489 ++++---- .../RAJA/policy/tensor/arch/avx/avx_int64.hpp | 1019 +++--- 
.../RAJA/policy/tensor/arch/avx/traits.hpp | 79 +- include/RAJA/policy/tensor/arch/avx2.hpp | 10 +- .../policy/tensor/arch/avx2/avx2_double.hpp | 1012 +++--- .../policy/tensor/arch/avx2/avx2_float.hpp | 971 ++--- .../policy/tensor/arch/avx2/avx2_int32.hpp | 1109 +++--- .../policy/tensor/arch/avx2/avx2_int64.hpp | 1037 +++--- .../RAJA/policy/tensor/arch/avx2/traits.hpp | 91 +- include/RAJA/policy/tensor/arch/avx512.hpp | 10 +- .../tensor/arch/avx512/avx512_double.hpp | 708 ++-- .../tensor/arch/avx512/avx512_float.hpp | 731 ++-- .../tensor/arch/avx512/avx512_int32.hpp | 861 ++--- .../tensor/arch/avx512/avx512_int64.hpp | 751 ++-- .../RAJA/policy/tensor/arch/avx512/traits.hpp | 80 +- include/RAJA/policy/tensor/arch/cuda.hpp | 4 +- .../policy/tensor/arch/cuda/cuda_warp.hpp | 1960 ++++++----- .../RAJA/policy/tensor/arch/cuda/traits.hpp | 31 +- include/RAJA/policy/tensor/arch/hip.hpp | 4 +- .../RAJA/policy/tensor/arch/hip/hip_wave.hpp | 1956 ++++++----- .../RAJA/policy/tensor/arch/hip/traits.hpp | 31 +- include/RAJA/policy/tensor/arch/scalar.hpp | 8 +- .../RAJA/policy/tensor/arch/scalar/scalar.hpp | 895 ++--- .../RAJA/policy/tensor/arch/scalar/traits.hpp | 85 +- include/RAJA/policy/tensor/arch_impl.hpp | 14 +- include/RAJA/policy/tensor/policy.hpp | 35 +- include/RAJA/util/BitMask.hpp | 101 +- include/RAJA/util/CombiningAdapter.hpp | 86 +- include/RAJA/util/EnableIf.hpp | 12 +- include/RAJA/util/IndexLayout.hpp | 143 +- include/RAJA/util/KokkosPluginLoader.hpp | 57 +- include/RAJA/util/Layout.hpp | 146 +- include/RAJA/util/LocalArray.hpp | 104 +- include/RAJA/util/OffsetLayout.hpp | 123 +- include/RAJA/util/OffsetOperators.hpp | 51 +- include/RAJA/util/Operators.hpp | 266 +- include/RAJA/util/Permutations.hpp | 43 +- include/RAJA/util/PermutedLayout.hpp | 16 +- include/RAJA/util/PluginContext.hpp | 28 +- include/RAJA/util/PluginLinker.hpp | 22 +- include/RAJA/util/PluginOptions.hpp | 14 +- include/RAJA/util/PluginStrategy.hpp | 28 +- include/RAJA/util/Registry.hpp | 239 +- include/RAJA/util/RepeatView.hpp | 117 +- include/RAJA/util/RuntimePluginLoader.hpp | 45 +- include/RAJA/util/SoAArray.hpp | 6 +- include/RAJA/util/SoAPtr.hpp | 75 +- include/RAJA/util/Span.hpp | 60 +- include/RAJA/util/StaticLayout.hpp | 162 +- include/RAJA/util/Timer.hpp | 15 +- include/RAJA/util/TypeConvert.hpp | 8 +- include/RAJA/util/TypedViewBase.hpp | 1278 +++---- include/RAJA/util/View.hpp | 217 +- include/RAJA/util/align.hpp | 18 +- include/RAJA/util/basic_mempool.hpp | 108 +- include/RAJA/util/camp_aliases.hpp | 2 +- include/RAJA/util/concepts.hpp | 10 +- include/RAJA/util/for_each.hpp | 40 +- include/RAJA/util/macros.hpp | 44 +- include/RAJA/util/math.hpp | 45 +- include/RAJA/util/mutex.hpp | 8 +- include/RAJA/util/plugins.hpp | 75 +- include/RAJA/util/reduce.hpp | 187 +- include/RAJA/util/resource.hpp | 295 +- include/RAJA/util/sort.hpp | 492 +-- include/RAJA/util/sycl_compat.hpp | 2 +- include/RAJA/util/types.hpp | 146 +- include/RAJA/util/zip.hpp | 104 +- include/RAJA/util/zip_tuple.hpp | 392 ++- src/AlignedRangeIndexSetBuilders.cpp | 122 +- src/DepGraphNode.cpp | 8 +- src/KokkosPluginLoader.cpp | 73 +- src/LockFreeIndexSetBuilders.cpp | 124 +- src/MemUtils_CUDA.cpp | 8 +- src/MemUtils_HIP.cpp | 8 +- src/MemUtils_SYCL.cpp | 8 +- src/PluginStrategy.cpp | 22 +- src/RuntimePluginLoader.cpp | 61 +- src/TensorStats.cpp | 13 +- ...t-dynamic-forall-resource-RangeSegment.hpp | 91 +- .../test-dynamic-forall-RangeSegment.hpp | 110 +- .../tests/test-forall-CombiningAdapter-1D.hpp | 99 +- 
.../tests/test-forall-CombiningAdapter-2D.hpp | 134 +- .../tests/test-forall-CombiningAdapter-3D.hpp | 222 +- .../tests/test-forall-atomic-basic.hpp | 101 +- .../tests/test-forall-AtomicRefAdd.hpp | 188 +- .../tests/test-forall-AtomicRefCAS.hpp | 187 +- .../tests/test-forall-AtomicRefLoadStore.hpp | 174 +- .../tests/test-forall-AtomicRefLogical.hpp | 271 +- .../tests/test-forall-AtomicRefMinMax.hpp | 173 +- .../tests/test-forall-AtomicRefSub.hpp | 173 +- .../tests/test-forall-AtomicMultiView.hpp | 74 +- ...test-forall-AtomicOutOfBoundsMultiView.hpp | 62 +- .../tests/test-forall-AtomicView.hpp | 52 +- .../tests/test-forall-IcountIndexSetView.hpp | 62 +- .../tests/test-forall-IndexSetView.hpp | 60 +- .../tests/test-forall-IcountIndexSet.hpp | 62 +- .../indexset/tests/test-forall-IndexSet.hpp | 55 +- .../tests/test-forall-basic-MultiReduce.hpp | 231 +- .../tests/test-forall-basic-ReduceBitAnd.hpp | 129 +- .../tests/test-forall-basic-ReduceBitOr.hpp | 134 +- .../tests/test-forall-basic-ReduceMax.hpp | 132 +- .../tests/test-forall-basic-ReduceMaxLoc.hpp | 146 +- .../tests/test-forall-basic-ReduceMin.hpp | 134 +- .../tests/test-forall-basic-ReduceMinLoc.hpp | 144 +- .../tests/test-forall-basic-ReduceSum.hpp | 132 +- .../test-forall-basic-expt-ReduceBitAnd.hpp | 147 +- .../test-forall-basic-expt-ReduceBitOr.hpp | 160 +- .../test-forall-basic-expt-ReduceMax.hpp | 161 +- .../test-forall-basic-expt-ReduceMaxLoc.hpp | 163 +- .../test-forall-basic-expt-ReduceMin.hpp | 159 +- .../test-forall-basic-expt-ReduceMinLoc.hpp | 161 +- .../test-forall-basic-expt-ReduceSum.hpp | 153 +- ...est-forall-indexset-multiple-ReduceMax.hpp | 78 +- ...-forall-indexset-multiple-ReduceMaxLoc.hpp | 95 +- ...est-forall-indexset-multiple-ReduceMin.hpp | 90 +- ...-forall-indexset-multiple-ReduceMinLoc.hpp | 95 +- ...est-forall-indexset-multiple-ReduceSum.hpp | 84 +- ...test-forall-segment-multiple-ReduceMax.hpp | 75 +- ...t-forall-segment-multiple-ReduceMaxLoc.hpp | 87 +- ...test-forall-segment-multiple-ReduceMin.hpp | 73 +- ...t-forall-segment-multiple-ReduceMinLoc.hpp | 86 +- ...test-forall-segment-multiple-ReduceSum.hpp | 111 +- .../region/tests/test-forall-region.hpp | 51 +- .../test-forall-ResourceIcountIndexSet.hpp | 64 +- .../tests/test-forall-ResourceIndexSet.hpp | 62 +- .../test-forall-resource-ListSegment.hpp | 74 +- .../test-forall-resource-RangeSegment.hpp | 71 +- ...est-forall-resource-RangeStrideSegment.hpp | 186 +- .../tests/test-forall-ListSegmentView.hpp | 143 +- .../tests/test-forall-RangeSegment2DView.hpp | 84 +- .../tests/test-forall-RangeSegmentView.hpp | 103 +- .../test-forall-RangeStrideSegmentView.hpp | 144 +- .../segment/tests/test-forall-ListSegment.hpp | 86 +- .../tests/test-forall-RangeSegment.hpp | 87 +- .../tests/test-forall-RangeStrideSegment.hpp | 183 +- .../indexset-build/test-aligned-indexset.cpp | 9 +- .../tests/basic-fission-fusion-loop-impl.hpp | 25 +- ...nel-basic-fission-fusion-loop-segments.hpp | 21 +- .../tests/basic-single-icount-loop-impl.hpp | 121 +- ...rnel-basic-single-icount-loop-segments.hpp | 92 +- .../tests/basic-single-loop-segments-impl.hpp | 116 +- ...test-kernel-basic-single-loop-segments.hpp | 115 +- ...el-resource-basic-single-loop-segments.hpp | 115 +- .../conditional-fission-fusion-loop-impl.hpp | 28 +- ...nditional-fission-fusion-loop-segments.hpp | 39 +- .../tests/test-kernel-hyperplane-2D.hpp | 151 +- .../tests/test-kernel-hyperplane-3D.hpp | 192 +- .../tests/test-kernel-nested-MultiReduce.hpp | 367 +- .../tests/nested-loop-BlockReduceSum-impl.hpp | 180 +- 
.../tests/nested-loop-ReduceSum-impl.hpp | 329 +- ...test-kernel-nested-loop-BlockReduceSum.hpp | 17 +- .../test-kernel-nested-loop-ReduceSum.hpp | 17 +- ...el-resource-nested-loop-BlockReduceSum.hpp | 17 +- ...-kernel-resource-nested-loop-ReduceSum.hpp | 17 +- ...test-kernel-nested-loops-segment-types.hpp | 247 +- .../test-kernel-nested-loop-OffsetView2D.hpp | 98 +- .../test-kernel-nested-loop-OffsetView3D.hpp | 111 +- ...ernel-nested-loop-PermutedOffsetView2D.hpp | 109 +- ...ernel-nested-loop-PermutedOffsetView3D.hpp | 142 +- ...test-kernel-nested-loop-PermutedView2D.hpp | 73 +- ...test-kernel-nested-loop-PermutedView3D.hpp | 90 +- .../tests/nested-loop-Basic-impl.hpp | 347 +- .../tests/nested-loop-MultiLambda-impl.hpp | 253 +- .../nested-loop-MultiLambdaParam-impl.hpp | 226 +- .../tests/test-kernel-nested-loop-Basic.hpp | 16 +- .../test-kernel-nested-loop-MultiLambda.hpp | 11 +- ...st-kernel-nested-loop-MultiLambdaParam.hpp | 12 +- ...test-kernel-resource-nested-loop-Basic.hpp | 16 +- ...ernel-resource-nested-loop-MultiLambda.hpp | 11 +- ...-resource-nested-loop-MultiLambdaParam.hpp | 12 +- .../tests/test-kernel-reduceloc-Max2D.hpp | 122 +- .../tests/test-kernel-reduceloc-Max2DView.hpp | 122 +- .../test-kernel-reduceloc-Max2DViewTuple.hpp | 121 +- .../tests/test-kernel-reduceloc-Min2D.hpp | 122 +- .../tests/test-kernel-reduceloc-Min2DView.hpp | 122 +- .../test-kernel-reduceloc-Min2DViewTuple.hpp | 121 +- .../region/tests/test-kernel-region-data.hpp | 10 +- .../region/tests/test-kernel-region-sync.hpp | 56 +- .../region/tests/test-kernel-region.hpp | 55 +- .../test-kernel-single-loop-ForICount.hpp | 45 +- .../test-kernel-single-loop-TileTCount.hpp | 45 +- .../tests/test-kernel-tile-Dynamic2D.hpp | 149 +- .../tests/test-kernel-tile-Fixed2D.hpp | 117 +- .../tests/test-kernel-tile-Fixed2DMinMax.hpp | 84 +- .../tests/test-kernel-tile-Fixed2DSum.hpp | 61 +- .../tests/test-kernel-tile-LocalArray2D.hpp | 152 +- ...kernel-resource-warp-thread-ReduceMask.hpp | 14 +- ...kernel-resource-warp-thread-ReduceWarp.hpp | 14 +- ...t-kernel-resource-warp-thread-WarpLoop.hpp | 14 +- .../test-kernel-warp-thread-ReduceMask.hpp | 14 +- .../test-kernel-warp-thread-ReduceWarp.hpp | 14 +- .../test-kernel-warp-thread-WarpLoop.hpp | 14 +- .../tests/warp-thread-ReduceMask-impl.hpp | 219 +- .../tests/warp-thread-ReduceWarp-impl.hpp | 329 +- .../tests/warp-thread-WarpLoop-impl.hpp | 234 +- .../tests/test-launch-nested-MultiReduce.hpp | 359 +- .../tests/test-launch-nested-Direct.hpp | 237 +- .../tests/test-launch-nested-Loop.hpp | 234 +- .../tests/test-launch-nested-Tile-Direct.hpp | 256 +- .../tests/test-launch-nested-Tile-Loop.hpp | 253 +- .../tests/test-launch-basic-ReduceBitAnd.hpp | 173 +- .../tests/test-launch-basic-ReduceMin.hpp | 176 +- .../tests/test-launch-basic-ReduceSum.hpp | 158 +- ...t-launch-basic-param-expt-ReduceBitAnd.hpp | 205 +- ...test-launch-basic-param-expt-ReduceMin.hpp | 206 +- ...test-launch-basic-param-expt-ReduceSum.hpp | 189 +- .../tests/test-launch-BasicShared.hpp | 107 +- .../segment/tests/test-launch-ListSegment.hpp | 134 +- .../tests/test-launch-RangeSegment.hpp | 179 +- .../tests/test-launch-RangeStrideSegment.hpp | 251 +- .../tests/test-launch-DynamicMem.hpp | 144 +- .../tests/test-launch-StaticMem.hpp | 136 +- .../test-launch-nested-Tile-iCount-Direct.hpp | 188 +- .../test-launch-nested-Tile-iCount-Loop.hpp | 191 +- .../scan/tests/test-scan-Exclusive.hpp | 77 +- .../scan/tests/test-scan-ExclusiveInplace.hpp | 71 +- .../scan/tests/test-scan-Inclusive.hpp | 63 +- 
.../scan/tests/test-scan-InclusiveInplace.hpp | 52 +- test/functional/scan/tests/test-scan-data.hpp | 16 +- .../matrix/test-tensor-matrix-double.hpp | 135 +- .../matrix/test-tensor-matrix-float.hpp | 73 +- .../matrix/test-tensor-matrix-int32_t.hpp | 72 +- .../matrix/test-tensor-matrix-int64_t.hpp | 144 +- .../tests/test-tensor-matrix-CtorGetSet.hpp | 59 +- .../tests/test-tensor-matrix-ET_Add.hpp | 143 +- .../tests/test-tensor-matrix-ET_Divide.hpp | 144 +- .../tests/test-tensor-matrix-ET_LoadStore.hpp | 182 +- ...-tensor-matrix-ET_MatrixMatrixMultiply.hpp | 189 +- ...nsor-matrix-ET_MatrixMatrixMultiplyAdd.hpp | 202 +- .../test-tensor-matrix-ET_MatrixVector.hpp | 168 +- .../tests/test-tensor-matrix-ET_Negate.hpp | 116 +- .../tests/test-tensor-matrix-ET_Subtract.hpp | 143 +- .../tests/test-tensor-matrix-ET_Transpose.hpp | 137 +- .../test-tensor-matrix-Load_ColMajor.hpp | 126 +- .../test-tensor-matrix-Load_RowMajor.hpp | 127 +- .../test-tensor-matrix-Store_ColMajor.hpp | 140 +- .../test-tensor-matrix-Store_RowMajor.hpp | 140 +- .../tests/test-tensor-matrix-Transpose.hpp | 59 +- .../tests/test-tensor-register-Add.hpp | 55 +- .../tests/test-tensor-register-Divide.hpp | 76 +- .../tests/test-tensor-register-DotProduct.hpp | 32 +- .../tests/test-tensor-register-FMA.hpp | 44 +- .../tests/test-tensor-register-FMS.hpp | 44 +- .../tests/test-tensor-register-Gather.hpp | 52 +- .../tests/test-tensor-register-GetSet.hpp | 121 +- .../tests/test-tensor-register-Load.hpp | 89 +- .../tests/test-tensor-register-Max.hpp | 51 +- .../tests/test-tensor-register-Min.hpp | 49 +- .../tests/test-tensor-register-Multiply.hpp | 55 +- .../tests/test-tensor-register-Scatter.hpp | 68 +- ...ensor-register-SegmentedBroadcastInner.hpp | 65 +- ...ensor-register-SegmentedBroadcastOuter.hpp | 52 +- ...st-tensor-register-SegmentedDotProduct.hpp | 48 +- ...test-tensor-register-SegmentedSumInner.hpp | 42 +- ...test-tensor-register-SegmentedSumOuter.hpp | 41 +- .../tests/test-tensor-register-Store.hpp | 101 +- .../tests/test-tensor-register-Subtract.hpp | 55 +- .../tests/test-tensor-vector-CtorGetSet.hpp | 46 +- .../tests/test-tensor-vector-FmaFms.hpp | 50 +- .../test-tensor-vector-ForallVectorRef1d.hpp | 89 +- .../test-tensor-vector-ForallVectorRef2d.hpp | 115 +- .../tests/test-tensor-vector-MinMax.hpp | 24 +- .../tests/test-tensor-vector-SumDot.hpp | 24 +- .../util/test-CombiningAdapter-1D.cpp | 21 +- .../util/test-CombiningAdapter-2D.cpp | 47 +- .../util/test-CombiningAdapter-3D.cpp | 74 +- .../util/test-PermutedCombiningAdapter-1D.cpp | 21 +- .../util/test-PermutedCombiningAdapter-2D.cpp | 44 +- .../util/test-PermutedCombiningAdapter-3D.cpp | 74 +- .../test-workgroup-Ordered-MultipleReuse.hpp | 629 ++-- .../tests/test-workgroup-Ordered-Single.hpp | 274 +- ...test-workgroup-Unordered-MultipleReuse.hpp | 577 +-- .../tests/test-workgroup-Unordered-Single.hpp | 268 +- test/include/RAJA_gtest.hpp | 277 +- test/include/RAJA_test-abs.hpp | 27 +- test/include/RAJA_test-atomic-ref-types.hpp | 91 +- test/include/RAJA_test-atomic-types.hpp | 15 +- test/include/RAJA_test-atomicpol.hpp | 88 +- test/include/RAJA_test-base.hpp | 3 +- test/include/RAJA_test-dynamic-forall.hpp | 19 +- .../RAJA_test-forall-async-execpol.hpp | 15 +- test/include/RAJA_test-forall-data.hpp | 12 +- test/include/RAJA_test-forall-execpol.hpp | 174 +- .../RAJA_test-forall-indexset-execpol.hpp | 44 +- test/include/RAJA_test-index-types.hpp | 58 +- test/include/RAJA_test-indexset-build.hpp | 64 +- .../RAJA_test-kernel-nested-loop-types.hpp | 153 +- 
...launch-direct-teams-threads-1D-execpol.hpp | 74 +- ...launch-direct-teams-threads-3D-execpol.hpp | 131 +- test/include/RAJA_test-launch-execpol.hpp | 60 +- ...t-launch-loop-teams-threads-1D-execpol.hpp | 82 +- ...t-launch-loop-teams-threads-3D-execpol.hpp | 140 +- .../RAJA_test-launch-runtime-execpol.hpp | 149 +- .../RAJA_test-multi-reduce-abstractor.hpp | 217 +- test/include/RAJA_test-multi-reducepol.hpp | 29 +- test/include/RAJA_test-platform.hpp | 7 +- test/include/RAJA_test-plugin-kernelpol.hpp | 138 +- test/include/RAJA_test-plugin-launchpol.hpp | 14 +- .../RAJA_test-plugin-resource-launchpol.hpp | 14 +- test/include/RAJA_test-reduce-types.hpp | 13 +- test/include/RAJA_test-reduceloc-types.hpp | 11 +- test/include/RAJA_test-reducepol.hpp | 39 +- test/include/RAJA_test-tensor.hpp | 239 +- test/include/RAJA_test-workgroup.hpp | 292 +- test/include/RAJA_unit-test-for3d3d.hpp | 139 +- test/include/RAJA_unit-test-forone.hpp | 29 +- test/include/RAJA_unit-test-policy.hpp | 62 +- test/include/RAJA_unit-test-types.hpp | 37 +- test/include/type_helper.hpp | 45 +- .../using-with-cmake/using-with-cmake.cpp | 14 +- test/integration/plugin/plugin_to_test.cpp | 22 +- test/integration/plugin/tests/counter.hpp | 14 +- .../plugin/tests/test-plugin-forall.hpp | 157 +- .../plugin/tests/test-plugin-kernel.hpp | 42 +- .../plugin/tests/test-plugin-launch.hpp | 51 +- .../tests/test-plugin-resource-launch.hpp | 54 +- .../plugin/tests/test-plugin-workgroup.hpp | 327 +- test/integration/plugin/tests/test-plugin.hpp | 71 +- test/integration/plugin_for_test_dynamic.cpp | 10 +- test/integration/plugin_for_test_kokkos.cpp | 20 +- test/integration/test_plugin_dynamic.cpp | 2 +- test/integration/test_plugin_kokkos.cpp | 2 +- test/old-tests/unit/cpu/test-synchronize.cpp | 3 +- test/old-tests/unit/cuda/test-synchronize.cpp | 14 +- test/old-tests/unit/test-sharedmem.cpp | 1302 ++++--- test/old-tests/unit/test-simd.cpp | 88 +- .../test-algorithm-util-for_each.cpp | 60 +- .../tests/test-algorithm-reduce-utils.hpp | 299 +- .../tests/test-algorithm-sort-utils.hpp | 511 +-- .../algorithm/tests/test-algorithm-sort.hpp | 68 +- .../tests/test-algorithm-stable-sort.hpp | 68 +- .../tests/test-algorithm-util-reduce.hpp | 146 +- .../tests/test-algorithm-util-sort.hpp | 539 ++- test/unit/atomic/test-atomic-incdec.cpp | 143 +- .../unit/atomic/test-atomic-ref-accessors.cpp | 90 +- test/unit/atomic/test-atomic-ref-addsub.cpp | 125 +- test/unit/atomic/test-atomic-ref-bitwise.cpp | 172 +- .../atomic/test-atomic-ref-constructor.cpp | 100 +- .../unit/atomic/test-atomic-ref-exchanges.cpp | 198 +- test/unit/atomic/test-atomic-ref-minmax.cpp | 89 +- test/unit/atomic/test-atomic-ref.hpp | 102 +- test/unit/hip/test-synchronize.cpp | 22 +- test/unit/index/test-indexset.cpp | 37 +- test/unit/index/test-indexvalue.cpp | 7 +- test/unit/index/test-listsegment.cpp | 44 +- test/unit/index/test-rangesegment.cpp | 42 +- test/unit/index/test-rangestridesegment.cpp | 99 +- test/unit/indexing/test-indexing.hpp | 30 +- .../indexing/tests/test-indexing-global.hpp | 88 +- test/unit/internal/test-iterators.cpp | 22 +- test/unit/internal/test-rajavec.cpp | 4 +- .../unit/multi_reducer/test-multi-reducer.hpp | 34 +- .../tests/test-multi-reducer-constructors.hpp | 183 +- .../tests/test-multi-reducer-reset.hpp | 417 ++- .../test-reducer-constructors-cuda.cpp | 16 +- .../reducer/test-reducer-constructors-hip.cpp | 16 +- ...est-reducer-constructors-openmp-target.cpp | 10 +- .../test-reducer-constructors-openmp.cpp | 16 +- 
 .../reducer/test-reducer-constructors-seq.cpp | 17 +-
 test/unit/reducer/test-reducer-reset-cuda.cpp | 6 +-
 test/unit/reducer/test-reducer-reset-hip.cpp | 6 +-
 .../test-reducer-reset-openmp-target.cpp | 6 +-
 .../reducer/test-reducer-reset-openmp.cpp | 6 +-
 test/unit/reducer/test-reducer-reset-seq.cpp | 7 +-
 test/unit/reducer/test-reducer.hpp | 18 +-
 .../tests/test-reducer-constructors.hpp | 145 +-
 .../unit/reducer/tests/test-reducer-reset.hpp | 177 +-
 .../tests/test-resource-AsyncTime.hpp | 59 +-
 .../test-resource-BasicAsyncSemantics.hpp | 34 +-
 .../resource/tests/test-resource-Depends.hpp | 46 +-
 .../test-resource-JoinAsyncSemantics.hpp | 35 +-
 .../tests/test-resource-MultiStream.hpp | 64 +-
 .../test-operators-bitwise-modulus.cpp | 41 +-
 .../operator/test-operators-equivalence.cpp | 104 +-
 .../util/operator/test-operators-identity.cpp | 44 +-
 .../util/operator/test-operators-logical.cpp | 55 +-
 .../util/operator/test-operators-math.cpp | 52 +-
 test/unit/util/test-float-limits.cpp | 7 +-
 test/unit/util/test-fraction.cpp | 17 +-
 test/unit/util/test-integral-limits.cpp | 7 +-
 test/unit/util/test-math.cpp | 17 +-
 test/unit/util/test-span.cpp | 41 +-
 test/unit/util/test-span.hpp | 40 +-
 test/unit/util/test-timer.cpp | 8 +-
 test/unit/view-layout/test-indexlayout.cpp | 215 +-
 test/unit/view-layout/test-makelayout.cpp | 115 +-
 test/unit/view-layout/test-multiview.cpp | 275 +-
 .../unit/view-layout/test-standard-layout.cpp | 15 +-
 test/unit/view-layout/test-typedlayout.cpp | 119 +-
 test/unit/view-layout/test-typedview.cpp | 169 +-
 .../tests/test-util-workgroup-Enqueue.hpp | 12 +-
 .../tests/test-util-workgroup-WorkStorage.hpp | 29 +-
 .../tests/test-workgroup-Constructor.hpp | 186 +-
 .../tests/test-workgroup-Dispatcher.hpp | 296 +-
 .../tests/test-workgroup-Enqueue-Multiple.hpp | 184 +-
 .../tests/test-workgroup-Enqueue-Single.hpp | 180 +-
 ...test-workgroup-WorkStorage-Constructor.hpp | 29 +-
 .../test-workgroup-WorkStorage-InsertCall.hpp | 36 +-
 .../test-workgroup-WorkStorage-Iterator.hpp | 44 +-
 .../test-workgroup-WorkStorage-Multiple.hpp | 116 +-
 719 files changed, 70916 insertions(+), 60452 deletions(-)

diff --git a/.clang-format b/.clang-format
index 47b6b0bee6..ca4ac0cd75 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,12 +2,14 @@ BasedOnStyle : LLVM
 # Indent formatting
 IndentWidth : 2
 UseTab: Never
-BreakBeforeBraces : Linux
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
+# This must be off so that include order in RAJA is preserved
+SortIncludes: false
 
 # Control curly brace placement
+BreakBeforeBraces : Custom
 BraceWrapping:
   AfterCaseLabel: true
   AfterClass: true
   AfterControlStatement: true
   AfterEnum: true
   AfterFunction: true
   AfterNamespace: true
   AfterObjCDeclaration: false
   AfterStruct: true
   AfterUnion: true
   AfterExternBlock: false
   BeforeCatch: true
   BeforeElse: true
   # BeforeLambdaBody: true # available in clang 11
   IndentBraces: false
   SplitEmptyFunction: false
   SplitEmptyRecord: false
   SplitEmptyNamespace: false
@@ -30,22 +32,17 @@ BraceWrapping:
 # Pointer alignment
 DerivePointerAlignment: false
 PointerAlignment: Left
-SortIncludes: false
 AllowShortIfStatementsOnASingleLine : true
-ConstructorInitializerAllOnOneLineOrOnePerLine : true
 AllowShortFunctionsOnASingleLine : true
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : true
 AllowAllParametersOfDeclarationOnNextLine : false
 AlignTrailingComments : true
+BinPackArguments : false
+BinPackParameters : false
+ConstructorInitializerAllOnOneLineOrOnePerLine : true
 ColumnLimit : 80
-PenaltyBreakBeforeFirstCallParameter : 100
-PenaltyReturnTypeOnItsOwnLine : 65000
-PenaltyBreakString : 10
 
-BreakBeforeBinaryOperators : None
 AlignAfterOpenBracket: true
-BinPackArguments : false
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-
+BreakBeforeBinaryOperators : None
diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp
index 5131010bd6..5e81a19681 100644 --- a/examples/dynamic-forall.cpp +++ b/examples/dynamic-forall.cpp @@ -28,22 +28,26 @@ void checkResult(int* res, int len); void printResult(int* res, int len); -using policy_list = camp::list - ,RAJA::cuda_exec<512> + , + RAJA::cuda_exec<256>, + RAJA::cuda_exec<512> #endif >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the policy to run"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the " + "policy to run"); } // @@ -55,58 +59,61 @@ int main(int argc, char *argv[]) const int pol = std::stoi(argv[1]); std::cout << "\n\nRAJA vector addition example...\n"; - std::cout << "Using policy # "<(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// - //policy is chosen from the list - RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); + // policy is chosen from the list + RAJA::expt::dynamic_forall( + pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + c[i] = a[i] + b[i]; + }); // _rajaseq_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. -// + //----------------------------------------------------------------------------// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -122,12 +129,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -138,7 +152,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index feb5247224..83b946b732 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -83,99 +83,110 @@ using launch_policy = RAJA::LaunchPolicy< * Define team policies. * Up to 3 dimension are supported: x,y,z */ -using outer0 = RAJA::LoopPolicy< - RAJA::seq_exec +using outer0 = RAJA::LoopPolicy; + >; using outer1 = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_for_exec + RAJA::omp_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_y_direct + , + RAJA::cuda_block_y_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_y_direct + , + RAJA::hip_block_y_direct #endif #if defined(RAJA_ENABLE_SYCL) - , - RAJA::sycl_group_1_direct + , + RAJA::sycl_group_1_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ -using inner0 = RAJA::LoopPolicy< - RAJA::seq_exec +using inner0 = RAJA::LoopPolicy; + >; using inner1 = RAJA::LoopPolicy; + >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or " + "./dynamic_mat_transpose device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). - // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device + // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose + // device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or " + "./dynamic_mat_transpose device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<(N_r * N_c); - int *At = host_res.allocate(N_r * N_c); + int* A = host_res.allocate(N_r * N_c); + int* At = host_res.allocate(N_r * N_c); // // In the following implementations of matrix transpose, we // use RAJA 'View' objects to access the matrix data. 
A RAJA view @@ -225,12 +238,14 @@ int main(int argc, char *argv[]) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of shared matrix transpose...\n"; @@ -241,8 +256,10 @@ int main(int argc, char *argv[]) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -253,14 +270,17 @@ int main(int argc, char *argv[]) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -272,19 +292,21 @@ int main(int argc, char *argv[]) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _dynamic_mattranspose_localarray_cstyle_end @@ -294,24 +316,26 @@ int main(int argc, char *argv[]) //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; + std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory " + "...\n"; - //Reset memory + // Reset memory std::memset(At, 0, N_r * N_c * sizeof(int)); #if defined(RAJA_GPU_ACTIVE) - //Allocate device side pointers + // Allocate device side pointers int *d_A = nullptr, *d_At = nullptr; - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { - d_A = device_res.allocate(N_r * N_c); + d_A = device_res.allocate(N_r * N_c); d_At = device_res.allocate(N_r * N_c); device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c); device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c); - //switch host/device pointers so we can reuse the views + // switch host/device pointers so we can reuse the views Aview.set_data(d_A); Atview.set_data(d_At); } @@ -322,65 +346,71 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_shared_mem_end // _dynamic_mattranspose_kernel_start - RAJA::launch - (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), - RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), - "Matrix tranpose with dynamic shared 
memory kernel", - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) - { - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ - - //Request memory from shared memory pool - int * tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); - - //Use RAJA View for simplified indexing - RAJA::View> Tile(tile_ptr, TILE_DIM, TILE_DIM); - - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ - - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index - - // Bounds check - if (row < N_r && col < N_c) { - Tile(ty,tx) = Aview(row, col); - } - - }); - }); - - //Barrier is needed to ensure all threads have written to Tile - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ - RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ - - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index - - // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile(ty, tx); - } - - }); + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), + RAJA::Threads(TILE_DIM, TILE_DIM), + dynamic_shared_mem_size), + "Matrix tranpose with dynamic shared memory kernel", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&](int by) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, outer_Dimc), [&](int bx) { + // Request memory from shared memory pool + int* tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); + + // Use RAJA View for simplified indexing + RAJA::View> Tile( + tile_ptr, TILE_DIM, TILE_DIM); + + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int ty) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int tx) { + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) + { + Tile(ty, tx) = Aview(row, col); + } + }); + }); + + // Barrier is needed to ensure all threads have written to Tile + ctx.teamSync(); + + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int ty) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, TILE_DIM), [&](int tx) { + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) + { + Atview(col, row) = Tile(ty, tx); + } + }); + }); + + // The launch context uses bump style allocator in which calls + // to getSharedMemory moves a memory buffer pointer to return + // different segments of shared memory. To avoid requesting + // beyond the pre-allocated memory quantity we reset the + // allocator offset counter in the launch context effectively + // releasing shared memory. + ctx.releaseSharedMemory(); }); - - //The launch context uses bump style allocator in which calls - //to getSharedMemory moves a memory buffer pointer to return - //different segments of shared memory. To avoid requesting beyond - //the pre-allocated memory quantity we reset the allocator offset counter - //in the launch context effectively releasing shared memory. 
- ctx.releaseSharedMemory(); - }); + }); }); - }); // _dynamic_mattranspose_kernel_end #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c); device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c); @@ -392,15 +422,16 @@ int main(int argc, char *argv[]) checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - //Release data + // Release data host_res.deallocate(A); host_res.deallocate(At); #if defined(RAJA_GPU_ACTIVE) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(d_A); device_res.deallocate(d_At); } @@ -418,16 +449,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -439,11 +476,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - //std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) //<< std::endl; - printf("%d ",Atview(row, col)); + printf("%d ", Atview(row, col)); } std::cout << "" << std::endl; } diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index fb82582704..9779f6c02b 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -47,28 +47,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // RAJA::resources::Host host_res; int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -76,36 +80,36 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -113,46 +117,52 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(host_res, arange, - RAJA::expt::Reduce(&seq_sum), - RAJA::expt::Reduce(&seq_min), - RAJA::expt::Reduce(&seq_max), - RAJA::expt::Reduce(&seq_minloc), - RAJA::expt::Reduce(&seq_maxloc), - RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), - [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { - _seq_sum += a[i]; - - _seq_min = RAJA_MIN(a[i], _seq_min); - _seq_max = RAJA_MAX(a[i], _seq_max); - - _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); - _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); - //_seq_minloc.min(a[i], i); - //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX - // above. 
- } - ); + RAJA::forall( + host_res, + arange, + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), + [=](int i, + int& _seq_sum, + int& _seq_min, + int& _seq_max, + VALLOC_INT& _seq_minloc, + VALLOC_INT& _seq_maxloc) { + _seq_sum += a[i]; + + _seq_min = RAJA_MIN(a[i], _seq_min); + _seq_max = RAJA_MAX(a[i], _seq_max); + + _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); + _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); + //_seq_minloc.min(a[i], i); + //_seq_maxloc.max(a[i], i); + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and + // RAJA_MAX above. + }); std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -161,37 +171,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(host_res, arange, - RAJA::expt::Reduce(&omp_sum), - RAJA::expt::Reduce(&omp_min), - RAJA::expt::Reduce(&omp_max), - RAJA::expt::Reduce(&omp_minloc), - RAJA::expt::Reduce(&omp_maxloc), - RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), - [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { - _omp_sum += a[i]; - - _omp_min = RAJA_MIN(a[i], _omp_min); - _omp_max = RAJA_MAX(a[i], _omp_max); - - _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); - _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); - //_omp_minloc.min(a[i], i); - //_omp_maxloc.max(a[i], i); - } - ); + RAJA::forall( + host_res, + arange, + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), + [=](int i, + int& _omp_sum, + int& _omp_min, + int& _omp_max, + VALLOC_INT& _omp_minloc, + VALLOC_INT& _omp_maxloc) { + _omp_sum += a[i]; + + _omp_min = RAJA_MIN(a[i], _omp_min); + _omp_max = RAJA_MAX(a[i], _omp_max); + + _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); + _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); + //_omp_minloc.min(a[i], i); + //_omp_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " 
<< omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_TARGET_OPENMP) std::cout << "\n Running RAJA OpenMP Target reductions...\n"; @@ -199,7 +215,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::resources::Omp omp_res; // _reductions_raja_omppolicy_start - using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; + using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; // _reductions_raja_omppolicy_end int omp_t_sum = 0; @@ -208,38 +224,44 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(omp_res, arange, - RAJA::expt::Reduce(&omp_t_sum), - RAJA::expt::Reduce(&omp_t_min), - RAJA::expt::Reduce(&omp_t_max), - RAJA::expt::Reduce(&omp_t_minloc), - RAJA::expt::Reduce(&omp_t_maxloc), - RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), - [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { - _omp_t_sum += a[i]; - - _omp_t_min = RAJA_MIN(a[i], _omp_t_min); - _omp_t_max = RAJA_MAX(a[i], _omp_t_max); - - _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); - _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); - //_omp_t_minloc.min(a[i], i); - //_omp_t_maxloc.max(a[i], i); - } - ); + RAJA::forall( + omp_res, + arange, + RAJA::expt::Reduce(&omp_t_sum), + RAJA::expt::Reduce(&omp_t_min), + RAJA::expt::Reduce(&omp_t_max), + RAJA::expt::Reduce(&omp_t_minloc), + RAJA::expt::Reduce(&omp_t_maxloc), + RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), + [=](int i, + int& _omp_t_sum, + int& _omp_t_min, + int& _omp_t_max, + VALLOC_INT& _omp_t_minloc, + VALLOC_INT& _omp_t_maxloc) { + _omp_t_sum += a[i]; + + _omp_t_min = RAJA_MIN(a[i], _omp_t_min); + _omp_t_max = RAJA_MAX(a[i], _omp_t_max); + + _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); + _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); + //_omp_t_minloc.min(a[i], i); + //_omp_t_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " - << omp_t_minloc.getLoc() << std::endl; + << omp_t_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " - << omp_t_maxloc.getLoc() << std::endl; + << omp_t_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -250,7 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) cuda_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; // _reductions_raja_cudapolicy_end int cuda_sum = 0; @@ -259,37 +281,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(cuda_res, 
arange, - RAJA::expt::Reduce(&cuda_sum), - RAJA::expt::Reduce(&cuda_min), - RAJA::expt::Reduce(&cuda_max), - RAJA::expt::Reduce(&cuda_minloc), - RAJA::expt::Reduce(&cuda_maxloc), - RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), - [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - _cuda_sum += d_a[i]; - - _cuda_min = RAJA_MIN(d_a[i], _cuda_min); - _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - - _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); - //_cuda_minloc.min(a[i], i); - //_cuda_maxloc.max(a[i], i); - } - ); + RAJA::forall( + cuda_res, + arange, + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), + [=] RAJA_DEVICE(int i, + int& _cuda_sum, + int& _cuda_min, + int& _cuda_max, + VALLOC_INT& _cuda_minloc, + VALLOC_INT& _cuda_maxloc) { + _cuda_sum += d_a[i]; + + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); + + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); + //_cuda_minloc.min(a[i], i); + //_cuda_maxloc.max(a[i], i); + }); std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; cuda_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -300,7 +328,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; // _reductions_raja_hippolicy_end int hip_sum = 0; @@ -309,38 +337,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, - RAJA::expt::Reduce(&hip_sum), - RAJA::expt::Reduce(&hip_min), - RAJA::expt::Reduce(&hip_max), - RAJA::expt::Reduce(&hip_minloc), - RAJA::expt::Reduce(&hip_maxloc), - RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), - [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { - _hip_sum += d_a[i]; - - _hip_min = RAJA_MIN(d_a[i], _hip_min); - _hip_max = RAJA_MAX(d_a[i], _hip_max); - - _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); - _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); - //_hip_minloc.min(d_a[i], i); - //_hip_maxloc.max(d_a[i], i); - } - ); + RAJA::forall( + arange, + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), + [=] RAJA_DEVICE(int i, + int& _hip_sum, + int& _hip_min, + 
int& _hip_max, + VALLOC_INT& _hip_minloc, + VALLOC_INT& _hip_maxloc) { + _hip_sum += d_a[i]; + + _hip_min = RAJA_MIN(d_a[i], _hip_min); + _hip_max = RAJA_MAX(d_a[i], _hip_max); + + _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); + _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); + //_hip_minloc.min(d_a[i], i); + //_hip_maxloc.max(d_a[i], i); + }); std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; hip_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -351,7 +384,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) sycl_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using EXEC_POL3 = RAJA::sycl_exec; + using EXEC_POL3 = RAJA::sycl_exec; // _reductions_raja_syclpolicy_end int sycl_sum = 0; @@ -360,42 +393,48 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(sycl_res, arange, - RAJA::expt::Reduce(&sycl_sum), - RAJA::expt::Reduce(&sycl_min), - RAJA::expt::Reduce(&sycl_max), - RAJA::expt::Reduce(&sycl_minloc), - RAJA::expt::Reduce(&sycl_maxloc), - RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), - [=] RAJA_DEVICE (int i, int &_sycl_sum, int &_sycl_min, int &_sycl_max, VALLOC_INT &_sycl_minloc, VALLOC_INT &_sycl_maxloc) { - _sycl_sum += d_a[i]; - - _sycl_min = RAJA_MIN(d_a[i], _sycl_min); - _sycl_max = RAJA_MAX(d_a[i], _sycl_max); - - _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); - _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); - //_sycl_minloc.min(d_a[i], i); - //_sycl_maxloc.max(d_a[i], i); - } - ); + RAJA::forall( + sycl_res, + arange, + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), + [=] RAJA_DEVICE(int i, + int& _sycl_sum, + int& _sycl_min, + int& _sycl_max, + VALLOC_INT& _sycl_minloc, + VALLOC_INT& _sycl_maxloc) { + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + }); std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; + << sycl_maxloc.getLoc() << std::endl; sycl_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp index c3be312194..d8b145f9ee 100644 --- a/examples/forall_multi-reductions.cpp +++ b/examples/forall_multi-reductions.cpp @@ -27,7 +27,7 @@ * */ -template < typename t_exec_policy, typename t_multi_reduce_policy > +template struct Backend { using exec_policy = t_exec_policy; @@ -38,50 +38,51 @@ struct Backend auto example_policies = camp::make_tuple( - Backend{"Sequential"} + Backend{"Sequential"} #if defined(RAJA_ENABLE_OPENMP) - , Backend{"OpenMP"} + , + Backend{"OpenMP"} #endif #if defined(RAJA_ENABLE_CUDA) - , Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} + , + Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} #endif #if defined(RAJA_ENABLE_HIP) - , Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} + , + Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} #endif - ); +); -template < typename exec_policy, typename multi_reduce_policy > +template void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) { - RAJA::MultiReduceSum multi_reduce_sum(num_bins); - RAJA::MultiReduceMin multi_reduce_min(num_bins); - RAJA::MultiReduceMax multi_reduce_max(num_bins); + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); - RAJA::MultiReduceBitOr multi_reduce_or(num_bins); - - RAJA::forall(arange, - [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + RAJA::forall(arange, [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { int bin = bins[i]; - multi_reduce_sum[bin] += a[i]; + multi_reduce_sum[bin] += a[i]; multi_reduce_min[bin].min(a[i]); multi_reduce_max[bin].max(a[i]); - multi_reduce_and[bin] &= a[i]; - multi_reduce_or [bin] |= a[i]; - + multi_reduce_and[bin] &= a[i]; + multi_reduce_or[bin] |= a[i]; }); - for (int bin = 0; bin < num_bins; ++bin) { + for (int bin = 0; bin < num_bins; ++bin) + { std::cout << "\tsum[" << bin << "] = " << multi_reduce_sum.get(bin) << '\n'; std::cout << "\tmin[" << bin << "] = " << multi_reduce_min.get(bin) << '\n'; std::cout << "\tmax[" << bin << "] = " << multi_reduce_max.get(bin) << '\n'; std::cout << "\tand[" << bin << "] = " << multi_reduce_and.get(bin) << '\n'; - std::cout << "\tor [" << bin << "] = " << multi_reduce_or .get(bin) << '\n'; + std::cout << "\tor [" << bin << "] = " << multi_reduce_or.get(bin) << '\n'; std::cout << '\n'; } } @@ -90,77 +91,78 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) { // _multi_reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; const int num_bins = 10; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // camp::resources::Host host_res; int* host_bins = host_res.template allocate(N); - int* host_a = host_res.template allocate(N); + int* host_a = host_res.template allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { host_bins[i] = i % num_bins; - host_a[i] = (i % (2*num_bins)) - num_bins; + host_a[i] = (i % (2 * num_bins)) - num_bins; } // _multi_reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// for bin in [0, num_bins) -// - the sum will be (bin - num_bins/2) * N / num_bins -// - the min will be bin - num_bins -// - the max will be bin -// - the and will be min & max -// - the or will be min | max -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // for bin in [0, num_bins) + // - the sum will be (bin - num_bins/2) * N / num_bins + // - the min will be bin - num_bins + // - the max will be bin + // - the and will be min & max + // - the or will be min | max + // + + // + // Define index range for iterating over a elements in all examples + // // _multi_reductions_range_start RAJA::RangeSegment arange(0, N); // _multi_reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::for_each_tuple(example_policies, [&](auto const& backend) { - std::cout << "Running " << backend.name << " policies" << '\n'; using exec_policy = typename std::decay_t::exec_policy; - using multi_reduce_policy = typename std::decay_t::multi_reduce_policy; + using multi_reduce_policy = + typename std::decay_t::multi_reduce_policy; auto res = RAJA::resources::get_default_resource(); int* bins = res.template allocate(N); - int* a = res.template allocate(N); + int* a = res.template allocate(N); - res.memcpy(bins, host_bins, N*sizeof(int)); - res.memcpy(a , host_a , N*sizeof(int)); + res.memcpy(bins, host_bins, N * sizeof(int)); + res.memcpy(a, host_a, N * sizeof(int)); example_code(arange, num_bins, bins, a); res.deallocate(bins); - res.deallocate(a ); + res.deallocate(a); std::cout << std::endl; }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // host_res.deallocate(host_bins); - host_res.deallocate(host_a ); + host_res.deallocate(host_a); std::cout << "\n DONE!...\n"; diff --git a/examples/jacobi.cpp b/examples/jacobi.cpp index 0badaa7396..8bf25d9a86 100644 --- a/examples/jacobi.cpp +++ b/examples/jacobi.cpp @@ -39,7 +39,7 @@ * (I, Iold) and initialized to zero. The first set of * nested for loops apply an iteration of the Jacobi * scheme. The scheme is only applied to the interior - * nodes. + * nodes. 
* * The second set of nested for loops is used to * update Iold and compute the l_2 norm of the @@ -52,7 +52,7 @@ * ----[RAJA Concepts]--------------- * - Forall::nested loop * - RAJA Reduction - * + * */ @@ -63,9 +63,9 @@ * * CUDA_BLOCK_SIZE_Y - Number of threads in the * y-dimension of a cuda thread block - * + * * CUDA_BLOCK_SIZE - Number of threads per threads block -*/ + */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; #endif @@ -80,23 +80,24 @@ const int HIP_BLOCK_SIZE = 256; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double o, h; int n; }; -// +// // ----[Functions]--------- // solution - Function for the analytic solution // computeErr - Displays the maximum error in the solution // double solution(double x, double y); -void computeErr(double *I, grid_s grid); +void computeErr(double* I, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Jacobi Example"<(NN); - double *Iold = memoryManager::allocate(NN); + double* I = memoryManager::allocate(NN); + double* Iold = memoryManager::allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -138,23 +139,26 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // - for (int n = 1; n <= N; ++n) { - for (int m = 1; m <= N; ++m) { + for (int n = 1; n <= N; ++n) + { + for (int m = 1; m <= N; ++m) + { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); } } @@ -162,12 +166,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Compute residual and update Iold // resI2 = 0.0; - for (int k = 0; k < NN; k++) { + for (int k = 0; k < NN; k++) + { resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); Iold[k] = I[k]; } - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Standard C++ Loop - Maxed out on iterations \n"); exit(-1); } @@ -184,9 +190,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment gridRange(0, NN); RAJA::RangeSegment jacobiRange(1, (N + 1)); - using jacobiSeqNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; + using jacobiSeqNestedPolicy = RAJA::KernelPolicy>>>; printf("RAJA: Sequential Policy - Nested ForallN \n"); resI2 = 1; @@ -195,41 +202,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memset(Iold, 0, NN * sizeof(double)); /* - * Sequential Jacobi Iteration. + * Sequential Jacobi Iteration. * * Note that a RAJA ReduceSum object is used to accumulate the sum - * for the residual. Since the loop is run sequentially, this is - * not strictly necessary. It is done here for consistency and + * for the residual. Since the loop is run sequentially, this is + * not strictly necessary. It is done here for consistency and * comparison with other RAJA variants in this example. 
- */ - while (resI2 > tol * tol) { + */ + while (resI2 > tol * tol) + { - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = - 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( - gridRange, [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; + RAJA::forall(gridRange, [=](RAJA::Index_type k) { + RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); + Iold[k] = I[k]; + }); - }); - resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: Sequential - Maxed out on iterations! \n"); exit(-1); } @@ -237,17 +242,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); - - + + #if defined(RAJA_ENABLE_OPENMP) printf("RAJA: OpenMP Policy - Nested ForallN \n"); resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - + /* - * OpenMP parallel Jacobi Iteration. + * OpenMP parallel Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::omp_collapse_for_exec - @@ -256,41 +261,41 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * Note that OpenMP RAJA ReduceSum object performs the reduction * operation for the residual in a thread-safe manner. */ - - using jacobiOmpNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - while (resI2 > tol * tol) { - - RAJA::kernel(RAJA::make_tuple(jacobiRange,jacobiRange), - [=] (RAJA::Index_type m, RAJA::Index_type n) { - - - double x = gridx.o + m * gridx.h; - double y = gridx.o + n * gridx.h; - - double f = gridx.h * gridx.h * - (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - - int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + - Iold[id - 1] + Iold[id + 1]); - }); + + using jacobiOmpNestedPolicy = RAJA::KernelPolicy>>>; + + while (resI2 > tol * tol) + { + + RAJA::kernel( + RAJA::make_tuple(jacobiRange, jacobiRange), + [=](RAJA::Index_type m, RAJA::Index_type n) { + double x = gridx.o + m * gridx.h; + double y = gridx.o + n * gridx.h; + + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + + int id = n * (N + 2) + m; + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); + }); RAJA::ReduceSum RAJA_resI2(0.0); - RAJA::forall( gridRange, - [=](RAJA::Index_type k) { - - RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); - Iold[k] = I[k]; - - }); - + RAJA::forall( + gridRange, [=](RAJA::Index_type k) { + RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); + Iold[k] = I[k]; + }); + resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("Jacobi: OpenMP - Maxed out on iterations! 
\n"); exit(-1); } @@ -303,7 +308,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) /* - * CUDA Jacobi Iteration. + * CUDA Jacobi Iteration. * * ----[RAJA Policies]----------- * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - @@ -315,42 +320,45 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: CUDA Policy - Nested ForallN \n"); - using jacobiCUDANestedPolicy = RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > >; - + using jacobiCUDANestedPolicy = + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed<32>, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; + resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // - // Jacobi Iteration + // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] - + Iold[id + 1]); + I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + + Iold[id - 1] + Iold[id + 1]); }); // @@ -358,16 +366,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA::ReduceSum RAJA_resI2(0.0); RAJA::forall>( - gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { - + gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); Iold[k] = I[k]; - - }); + }); resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: CUDA - Maxed out on iterations! 
\n"); exit(-1); } @@ -392,47 +399,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("RAJA: HIP Policy - Nested ForallN \n"); - using jacobiHIPNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_direct, - RAJA::statement::For<0, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > >; + using jacobiHIPNestedPolicy = + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed<32>, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; resI2 = 1; iteration = 0; memset(I, 0, NN * sizeof(double)); memset(Iold, 0, NN * sizeof(double)); - double *d_I = memoryManager::allocate_gpu(NN); - double *d_Iold = memoryManager::allocate_gpu(NN); - hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice )); + double* d_I = memoryManager::allocate_gpu(NN); + double* d_Iold = memoryManager::allocate_gpu(NN); + hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { // // Jacobi Iteration // RAJA::kernel( - RAJA::make_tuple(jacobiRange,jacobiRange), - [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { - + RAJA::make_tuple(jacobiRange, jacobiRange), + [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h - * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); + double f = gridx.h * gridx.h * + (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); int id = n * (N + 2) + m; - d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] - + d_Iold[id + 1]); + d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + + d_Iold[id - 1] + d_Iold[id + 1]); }); // @@ -440,23 +451,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA::ReduceSum RAJA_resI2(0.0); RAJA::forall>( - gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { - + gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); d_Iold[k] = d_I[k]; - - }); + }); resI2 = RAJA_resI2; - if (iteration > maxIter) { + if (iteration > maxIter) + { printf("RAJA: HIP - Maxed out on iterations! 
\n"); exit(-1); } iteration++; } hipDeviceSynchronize(); - hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); computeErr(I, gridx); printf("No of iterations: %d \n \n", iteration); @@ -466,7 +476,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate(I); memoryManager::deallocate(Iold); - + return 0; } @@ -482,25 +492,26 @@ double solution(double x, double y) // // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} // -void computeErr(double *I, grid_s grid) +void computeErr(double* I, grid_s grid) { RAJA::RangeSegment gridRange(0, grid.n); RAJA::ReduceMax tMax(-1.0); - using jacobiSeqNestedPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - RAJA::kernel(RAJA::make_tuple(gridRange,gridRange), - [=] (RAJA::Index_type ty, RAJA::Index_type tx ) { - - int id = tx + grid.n * ty; - double x = grid.o + tx * grid.h; - double y = grid.o + ty * grid.h; - double myErr = std::abs(I[id] - solution(x, y)); - tMax.max(myErr); - }); + using jacobiSeqNestedPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(gridRange, gridRange), + [=](RAJA::Index_type ty, RAJA::Index_type tx) { + int id = tx + grid.n * ty; + double x = grid.o + tx * grid.h; + double y = grid.o + ty * grid.h; + double myErr = std::abs(I[id] - solution(x, y)); + tMax.max(myErr); + }); double l2err = tMax; printf("Max error = %lg, h = %f \n", l2err, grid.h); diff --git a/examples/kernel-dynamic-tile.cpp b/examples/kernel-dynamic-tile.cpp index 5de2123425..6aac29178a 100644 --- a/examples/kernel-dynamic-tile.cpp +++ b/examples/kernel-dynamic-tile.cpp @@ -1,34 +1,36 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA dynamic_tile example...\n\n"; -//Using policy = KernelPolicy, seq_exec, …>>; -//RAJA::kernel_param( -// make_tuple(RangeSegment(0,N)), -// make_tuple(32), // param 0 is referenced by tile_dynamic -// [=](int i, int tile_size){ -// -// }); + // Using policy = KernelPolicy, seq_exec, …>>; + // RAJA::kernel_param( + // make_tuple(RangeSegment(0,N)), + // make_tuple(32), // param 0 is referenced by tile_dynamic + // [=](int i, int tile_size){ + // + // }); using namespace RAJA; - kernel_param< - KernelPolicy< - statement::Tile<1, tile_dynamic<1>, seq_exec, - statement::Tile<0, tile_dynamic<0>, seq_exec, - statement::For<1, seq_exec, - statement::For<0, seq_exec, statement::Lambda<0>> - > - > - > - > - >(make_tuple(RangeSegment{0,25}, RangeSegment{0,25}), + kernel_param, + seq_exec, + statement::Tile< + 0, + tile_dynamic<0>, + seq_exec, + statement::For<1, + seq_exec, + statement::For<0, seq_exec, statement::Lambda<0>>>>>>>( + make_tuple(RangeSegment{0, 25}, RangeSegment{0, 25}), make_tuple(TileSize{5}, TileSize{10}), - //make_tuple(TileSize(10)), // not sure we need this, good for static_assert - [=](int i, int j, TileSize x, TileSize y){ - std::cout << "Running index (" << i << "," << j << ") of " << x.size << "x" << y.size << " tile." << std::endl; - }); - + // make_tuple(TileSize(10)), // not sure we need this, good for + // static_assert + [=](int i, int j, TileSize x, TileSize y) { + std::cout << "Running index (" << i << "," << j << ") of " << x.size + << "x" << y.size << " tile." 
<< std::endl; + }); } diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index b57bedfd6b..2682e15edd 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -38,7 +38,7 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//LC testing hardware has a limit of 151 +// LC testing hardware has a limit of 151 constexpr int SYCL_BLOCK_SIZE = 128; #endif @@ -48,14 +48,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Use a resource to allocate memory -// + // + // Use a resource to allocate memory + // RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) RAJA::resources::Cuda device_res; @@ -68,22 +68,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = host_res.allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -91,37 +95,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -// -// Define ValLoc Type -// + // + // Define ValLoc Type + // using VALLOC_INT = RAJA::expt::ValLoc; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using LAUNCH_POL1 = RAJA::LaunchPolicy; - using LOOP_POL1 = RAJA::LoopPolicy; + using LAUNCH_POL1 = RAJA::LaunchPolicy; + using LOOP_POL1 = RAJA::LoopPolicy; int seq_sum = 0; int seq_min = std::numeric_limits::max(); @@ -129,20 +133,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (host_res, RAJA::LaunchParams(), "SeqReductionKernel", - RAJA::expt::Reduce(&seq_sum), - RAJA::expt::Reduce(&seq_min), - RAJA::expt::Reduce(&seq_max), - RAJA::expt::Reduce(&seq_minloc), - RAJA::expt::Reduce(&seq_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int 
&_seq_sum, int &_seq_min, - int &_seq_max, VALLOC_INT &_seq_minloc, - VALLOC_INT &_seq_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + host_res, + RAJA::LaunchParams(), + "SeqReductionKernel", + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _seq_sum, + int& _seq_min, + int& _seq_max, + VALLOC_INT& _seq_minloc, + VALLOC_INT& _seq_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _seq_sum += a[i]; _seq_min = RAJA_MIN(a[i], _seq_min); @@ -152,33 +158,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); //_seq_minloc.min(a[i], i); //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX - // above. - } - ); - - } - ); + // Note : RAJA::expt::ValLoc objects provide min() and max() + // methods + // that are equivalent to the assignments with RAJA_MIN and + // RAJA_MAX above. + }); + }); std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " - << seq_minloc.getLoc() << std::endl; + << seq_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " - << seq_maxloc.getLoc() << std::endl; + << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using LAUNCH_POL2 = RAJA::LaunchPolicy; - using LOOP_POL2 = RAJA::LoopPolicy; + using LAUNCH_POL2 = RAJA::LaunchPolicy; + using LOOP_POL2 = RAJA::LoopPolicy; // _reductions_raja_omppolicy_end int omp_sum = 0; @@ -187,20 +191,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (host_res, RAJA::LaunchParams(), "OmpReductionKernel", - RAJA::expt::Reduce(&omp_sum), - RAJA::expt::Reduce(&omp_min), - RAJA::expt::Reduce(&omp_max), - RAJA::expt::Reduce(&omp_minloc), - RAJA::expt::Reduce(&omp_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_omp_sum, int &_omp_min, - int &_omp_max, VALLOC_INT &_omp_minloc, - VALLOC_INT &_omp_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + host_res, + RAJA::LaunchParams(), + "OmpReductionKernel", + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _omp_sum, + int& _omp_min, + int& _omp_max, + VALLOC_INT& _omp_minloc, + VALLOC_INT& _omp_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _omp_sum += a[i]; _omp_min = RAJA_MIN(a[i], _omp_min); @@ -210,23 +216,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); //_omp_minloc.min(a[i], i); //_omp_maxloc.max(a[i], i); - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << omp_sum << std::endl; std::cout 
<< "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -235,11 +238,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_cudapolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_cudapolicy_end - const int NUMBER_OF_TEAMS = (N-1)/CUDA_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / CUDA_BLOCK_SIZE + 1; int cuda_sum = 0; int cuda_min = std::numeric_limits::max(); @@ -247,21 +250,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), - "CUDAReductionKernel", - RAJA::expt::Reduce(&cuda_sum), - RAJA::expt::Reduce(&cuda_min), - RAJA::expt::Reduce(&cuda_max), - RAJA::expt::Reduce(&cuda_minloc), - RAJA::expt::Reduce(&cuda_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_cuda_sum, int &_cuda_min, int &_cuda_max, - VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(CUDA_BLOCK_SIZE)), + "CUDAReductionKernel", + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _cuda_sum, + int& _cuda_min, + int& _cuda_max, + VALLOC_INT& _cuda_minloc, + VALLOC_INT& _cuda_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _cuda_sum += d_a[i]; _cuda_min = RAJA_MIN(d_a[i], _cuda_min); @@ -271,26 +276,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); - - } - ); - - - } - ); + }); + }); std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -299,11 +299,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // 
_reductions_raja_hippolicy_start - using LAUNCH_POL3 = RAJA::LaunchPolicy>; - using LOOP_POL3 = RAJA::LoopPolicy; + using LAUNCH_POL3 = RAJA::LaunchPolicy>; + using LOOP_POL3 = RAJA::LoopPolicy; // _reductions_raja_hippolicy_end - const int NUMBER_OF_TEAMS = (N-1)/HIP_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / HIP_BLOCK_SIZE + 1; int hip_sum = 0; int hip_min = std::numeric_limits::max(); @@ -311,21 +311,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), - "HipReductionKernel", - RAJA::expt::Reduce(&hip_sum), - RAJA::expt::Reduce(&hip_min), - RAJA::expt::Reduce(&hip_max), - RAJA::expt::Reduce(&hip_minloc), - RAJA::expt::Reduce(&hip_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_hip_sum, int &_hip_min, - int &_hip_max, VALLOC_INT &_hip_minloc, - VALLOC_INT &_hip_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(HIP_BLOCK_SIZE)), + "HipReductionKernel", + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _hip_sum, + int& _hip_min, + int& _hip_max, + VALLOC_INT& _hip_minloc, + VALLOC_INT& _hip_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _hip_sum += d_a[i]; _hip_min = RAJA_MIN(d_a[i], _hip_min); @@ -335,25 +337,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); //_hip_minloc.min(d_a[i], i); //_hip_maxloc.max(d_a[i], i); - - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL reductions...\n"; @@ -362,11 +360,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_syclpolicy_start - using LAUNCH_POL4 = RAJA::LaunchPolicy>; - using LOOP_POL4 = RAJA::LoopPolicy; + using LAUNCH_POL4 = RAJA::LaunchPolicy>; + using LOOP_POL4 = RAJA::LoopPolicy; // _reductions_raja_syclpolicy_end - const int NUMBER_OF_TEAMS = (N-1)/SYCL_BLOCK_SIZE + 1; + const int NUMBER_OF_TEAMS = (N - 1) / SYCL_BLOCK_SIZE + 1; int sycl_sum = 0; int sycl_min = std::numeric_limits::max(); @@ -374,21 +372,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); - RAJA::launch - (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), - "SyclReductionKernel", - RAJA::expt::Reduce(&sycl_sum), - 
RAJA::expt::Reduce(&sycl_min), - RAJA::expt::Reduce(&sycl_max), - RAJA::expt::Reduce(&sycl_minloc), - RAJA::expt::Reduce(&sycl_maxloc), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, - int &_sycl_sum, int &_sycl_min, - int &_sycl_max, VALLOC_INT &_sycl_minloc, - VALLOC_INT &_sycl_maxloc) { - - RAJA::loop(ctx, arange, [&] (int i) { - + RAJA::launch( + device_res, + RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), + RAJA::Threads(SYCL_BLOCK_SIZE)), + "SyclReductionKernel", + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, + int& _sycl_sum, + int& _sycl_min, + int& _sycl_max, + VALLOC_INT& _sycl_minloc, + VALLOC_INT& _sycl_maxloc) { + RAJA::loop(ctx, arange, [&](int i) { _sycl_sum += d_a[i]; _sycl_min = RAJA_MIN(d_a[i], _sycl_min); @@ -398,29 +398,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); //_sycl_minloc.min(d_a[i], i); //_sycl_maxloc.max(d_a[i], i); - - } - ); - - } - ); + }); + }); std::cout << "\tsum = " << sycl_sum << std::endl; std::cout << "\tmin = " << sycl_min << std::endl; std::cout << "\tmax = " << sycl_max << std::endl; std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " - << sycl_minloc.getLoc() << std::endl; + << sycl_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " - << sycl_maxloc.getLoc() << std::endl; + << sycl_maxloc.getLoc() << std::endl; device_res.deallocate(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/launch_flatten.cpp b/examples/launch_flatten.cpp index 2a3d92ad84..b79cc249a4 100644 --- a/examples/launch_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -35,15 +35,17 @@ #if defined(RAJA_ENABLE_CUDA) using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::cuda_reduce; #elif defined(RAJA_ENABLE_HIP) using device_launch = RAJA::LaunchPolicy>; -using device_inner_pol0 = RAJA::LoopPolicy; -using device_inner_pol1 = RAJA::LoopPolicy; -using device_flatten_pol = RAJA::LoopPolicy; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = + RAJA::LoopPolicy; using reduce_policy = RAJA::hip_reduce; #endif @@ -54,7 +56,7 @@ using reduce_policy = RAJA::hip_reduce; using host_launch = RAJA::LaunchPolicy; using host_loop = RAJA::LoopPolicy; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) @@ -63,20 +65,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Problem size dimensions // constexpr int N = 4; - constexpr int NN = N*N; + constexpr int NN = N * N; // // Configure grid size // - RAJA::LaunchParams launch_params(RAJA::Teams(1), - RAJA::Threads(N, N)); + RAJA::LaunchParams launch_params(RAJA::Teams(1), RAJA::Threads(N, N)); // // Resource object for host, used to allocate memory // camp::resources::Host host_res; - int *h_A_ptr = host_res.allocate(NN); + int* h_A_ptr = host_res.allocate(NN); // // Resource object for device, used to allocate memory @@ -87,9 +88,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) camp::resources::Hip device_res; #endif - int *d_A_ptr = device_res.allocate(NN); + int* d_A_ptr = device_res.allocate(NN); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running device version of teams_flatten example ...\n"; @@ -97,27 +98,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); - RAJA::launch - (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) - { + RAJA::launch( + launch_params, [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::RangeSegment(0, N), [&](int i) { + d_A_2DView(j, i) = i + j; + }); + }); - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { - d_A_2DView(j, i) = i + j; - }); - }); + ctx.teamSync(); - ctx.teamSync(); + // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying + // accumulating memory contents + RAJA::loop( + ctx, RAJA::RangeSegment(0, NN), [&](int i) { + device_kernel_sum += d_A_1DView(i); + }); + }); - // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying - // accumulating memory contents - RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { - device_kernel_sum += d_A_1DView(i); - }); - - }); - 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running host version of teams_flatten example ...\n"; @@ -125,29 +126,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); - RAJA::launch - (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) - { - - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { - h_A_2DView(j, i) = i + j; - }); - }); - - ctx.teamSync(); - - //As loops are dispatched as standard C loops we can revert to using - //a regular seq_exec policy - RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { - host_kernel_sum += h_A_1DView(i); - }); - - }); - - if ( device_kernel_sum.get() == host_kernel_sum.get() ) { + RAJA::launch( + launch_params, [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int i) { + h_A_2DView(j, i) = i + j; + }); + }); + + ctx.teamSync(); + + // As loops are dispatched as standard C loops we can revert to using + // a regular seq_exec policy + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&](int i) { + host_kernel_sum += h_A_1DView(i); + }); + }); + + if (device_kernel_sum.get() == host_kernel_sum.get()) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } diff --git a/examples/launch_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp index 797c5ee7c5..7c00a71071 100644 --- a/examples/launch_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -31,23 +31,22 @@ /* * Define number of threads in x and y dimensions in a RAJA thread team * or in a CUDA/HIP thread blocks -*/ + */ #define THREAD_SZ 16 /* * Define host/device launch policies */ -using launch_policy = RAJA::LaunchPolicy< - RAJA::seq_launch_t +using launch_policy = RAJA::LaunchPolicy + , + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_launch_t + , + RAJA::hip_launch_t #endif - >; + >; using loop_policy = RAJA::seq_exec; @@ -77,39 +76,45 @@ using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; */ using teams_x = RAJA::LoopPolicy; + >; using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; using global_thread_x = RAJA::LoopPolicy; + >; using global_thread_y = RAJA::LoopPolicy; + >; // // Define dimensionality of matrices. 
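For readers skimming this hunk, a minimal sketch of how the paired host/device loop-policy aliases defined above are consumed: RAJA::launch selects the host or device member of each alias at run time. The vector-add function below is an illustrative placeholder, not one of the kernels in this example; launch_policy, global_thread_x, and THREAD_SZ are the names defined in this file.

void vector_add(RAJA::ExecPlace place, int N, const double* x, double* y)
{
  const int n_teams = (N - 1) / THREAD_SZ + 1;

  RAJA::launch<launch_policy>(
      place,
      RAJA::LaunchParams(RAJA::Teams(n_teams), RAJA::Threads(THREAD_SZ)),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        // global_thread_x resolves to seq_exec on the host and to a
        // one-thread-per-index GPU mapping on the device.
        RAJA::loop<global_thread_x>(ctx, RAJA::RangeSegment(0, N),
                                    [&](int i) { y[i] += x[i]; });
      });
}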
@@ -134,9 +139,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -147,8 +154,8 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) { - int Row = blockIdx.y*THREAD_SZ + threadIdx.y; - int Col = blockIdx.x*THREAD_SZ + threadIdx.x; + int Row = blockIdx.y * THREAD_SZ + threadIdx.y; + int Col = blockIdx.x * THREAD_SZ + threadIdx.x; __shared__ double As[THREAD_SZ][THREAD_SZ]; __shared__ double Bs[THREAD_SZ][THREAD_SZ]; @@ -156,15 +163,16 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) Cs[threadIdx.y][threadIdx.x] = 0.0; - for (int k = 0; k < (THREAD_SZ + N - 1)/THREAD_SZ; k++) { + for (int k = 0; k < (THREAD_SZ + N - 1) / THREAD_SZ; k++) + { - if ( static_cast(k*THREAD_SZ + threadIdx.x) < N && Row < N ) - As[threadIdx.y][threadIdx.x] = A[Row*N + k*THREAD_SZ + threadIdx.x]; + if (static_cast(k * THREAD_SZ + threadIdx.x) < N && Row < N) + As[threadIdx.y][threadIdx.x] = A[Row * N + k * THREAD_SZ + threadIdx.x]; else As[threadIdx.y][threadIdx.x] = 0.0; - if ( static_cast(k*THREAD_SZ + threadIdx.y) < N && Col < N) - Bs[threadIdx.y][threadIdx.x] = B[(k*THREAD_SZ + threadIdx.y)*N + Col]; + if (static_cast(k * THREAD_SZ + threadIdx.y) < N && Col < N) + Bs[threadIdx.y][threadIdx.x] = B[(k * THREAD_SZ + threadIdx.y) * N + Col]; else Bs[threadIdx.y][threadIdx.x] = 0.0; @@ -177,8 +185,8 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) } if (Row < N && Col < N) - C[((blockIdx.y * blockDim.y + threadIdx.y)*N) + - (blockIdx.x * blockDim.x)+ threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; + C[((blockIdx.y * blockDim.y + threadIdx.y) * N) + + (blockIdx.x * blockDim.x) + threadIdx.x] = Cs[threadIdx.y][threadIdx.x]; } #endif @@ -186,7 +194,7 @@ __global__ void sharedMatMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); template void checkResult(RAJA::View> Cview, int N); @@ -195,68 +203,72 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); template void printResult(RAJA::View> Cview, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix and number of teams based on -// number of threads in a dimension. -// + // + // Define num rows/cols in matrix and number of teams based on + // number of threads in a dimension. + // const int N = 1000; - const int NTeams = (N - 1)/THREAD_SZ + 1; + const int NTeams = (N - 1) / THREAD_SZ + 1; -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. 
+ // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::RangeSegment row_range(0, N); RAJA::RangeSegment col_range(0, N); @@ -265,120 +277,120 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// -// -// RAJA Team loops uses a RAJA::launch method to launch a kernel. -// These examples, illustrate the basic interface and mechanics. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) -// execution is chosen at run time and we support running on the host -// or device. -// -// 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP -// and considers programming using a group of threads in which we group into -// teams. Number of threads and teams are defined inside the Resources struct. -// -// 3) Launch context is used synchronize threads within a team, an example of this -// is presented further below. -// -// 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism can be -// expressed by mapping outer loops (up to 3) to gpu blocks (teams) and inner -// loops to threads in a block (team). 
-// + //----------------------------------------------------------------------------// + // + // RAJA Team loops uses a RAJA::launch method to launch a kernel. + // These examples, illustrate the basic interface and mechanics. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) RAJA Team loops execute inside a RAJA execution space (RAJA::launch) + // execution is chosen at run time and we support running on the host + // or device. + // + // 2) RAJA Team loops follows the thread/block programming models of CUDA/HIP + // and considers programming using a group of threads in which we group + // into teams. Number of threads and teams are defined inside the Resources + // struct. + // + // 3) Launch context is used synchronize threads within a team, an example of + // this + // is presented further below. + // + // 4) Parallelism is expressed through RAJA loops. Hierarchical parallelism + // can be + // expressed by mapping outer loops (up to 3) to gpu blocks (teams) and + // inner loops to threads in a block (team). + // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //As a starting point we demonstrate assigning each dot product - //to a thread on a two dimensional compute grid. Rows are mapped - //to threads in the x dimension, while Cols are mapped to threads - //in the y dimension. On the host this mapping simplifies to executing - //two for loops. + // As a starting point we demonstrate assigning each dot product + // to a thread on a two dimensional compute grid. Rows are mapped + // to threads in the x dimension, while Cols are mapped to threads + // in the y dimension. On the host this mapping simplifies to executing + // two for loops. // _matmult_basickernel_start - RAJA::launch(RAJA::ExecPlace::HOST, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; + RAJA::launch( + RAJA::ExecPlace::HOST, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); - - }); // _matmult_basickernel_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - //RAJA Team loops currently only support a pair of policies at a time. - //Switching between a sequential and OpenMP launch space requires - //recompiling execution policies. When running exclusively on the host - //the compute grid may be left uninitialized as loop methods get expanded to - //standard C style loops. + // RAJA Team loops currently only support a pair of policies at a time. 
+ // Switching between a sequential and OpenMP launch space requires + // recompiling execution policies. When running exclusively on the host + // the compute grid may be left uninitialized as loop methods get expanded to + // standard C style loops. using omp_launch_policy = RAJA::LaunchPolicy; using omp_col_policy0 = RAJA::LoopPolicy; using omp_row_policy0 = RAJA::LoopPolicy; - RAJA::launch(RAJA::LaunchParams(), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; + RAJA::launch( + RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); - - }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example collapses the row and col loops in an OpenMP parallel region. @@ -387,33 +399,32 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using global_thread_xy = RAJA::LoopPolicy; - RAJA::launch(RAJA::ExecPlace::HOST, - RAJA::LaunchParams(), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::expt::loop(ctx, col_range, row_range, [&] (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); - - }); + RAJA::launch( + RAJA::ExecPlace::HOST, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::expt::loop( + ctx, col_range, row_range, [&](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example maps row indicies to RAJA teams (CUDA @@ -425,87 +436,86 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. 
// // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(N), - RAJA::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - }); - }); - - }); + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device, tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device, tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int col) { - RAJA::loop(ctx, col_tile, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& row_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int col) { + RAJA::loop(ctx, col_tile, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); }); - }); - }); + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 
0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -521,74 +531,73 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(N), - RAJA::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, col_range, [&] (int col) { - RAJA::loop(ctx, row_range, [&] (int row) { - + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, col_range, [&](int col) { + RAJA::loop(ctx, row_range, [&](int row) { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += d_Aview(row, k) * d_Bview(k, col); } d_Cview(row, col) = dot; - + }); }); - }); - }); + }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult ...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This example takes the extents of the col and row loops and breaks // them down into `tiles`. Tile loops are used to generate RangeSegments of // fixed size, THREAD_SZ in this case. RAJA loops are then used to iterate - // across the work within each tile. On the device tiles are typically assigned - // to teams, while RAJA loops are mapped to threads. + // across the work within each tile. On the device tiles are typically + // assigned to teams, while RAJA loops are mapped to threads. // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
// - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int col) { - RAJA::loop(ctx, col_tile, [&] (int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& row_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int col) { + RAJA::loop(ctx, col_tile, [&](int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); + }); }); - }); }); - }); - }); + }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_HIP //----------------------------------------------------------------------------// @@ -596,9 +605,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running CUDA tiled mat-mult with shared memory ...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - using seq_loop = RAJA::LoopPolicy; + using seq_loop = RAJA::LoopPolicy; // // This example builds on the RAJA tiling capabilies presented earlier @@ -610,85 +619,99 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // - RAJA::launch(RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), - RAJA::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - // - // Loop over teams - // - RAJA::tile - (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &y_tile) { - RAJA::tile - (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &x_tile) { - - RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; - RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; - RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Cs[ty][tx] = 0.0; - }); - }); - - RAJA::tile - (ctx, THREAD_SZ, dot_range, [&] (RAJA::RangeSegment const &k_tile) { - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, k_tile, [&](int k_id, int tx) { - As[ty][tx] = Aview(row,k_id); - }); - }); - - RAJA::loop_icount(ctx, k_tile, [&](int k_id, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Bs[ty][tx] = Bview(k_id,col); - }); - }); - - ctx.teamSync(); - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - - RAJA::loop_icount(ctx, k_tile, [&] (int gid, int e) { - Cs[ty][tx] += As[ty][e] * Bs[e][tx]; - }); - - }); - }); - - ctx.teamSync(); - - }); // slide across matrix - - RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) 
{ - RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - Cview(col,row) = Cs[ty][tx]; - }); - }); - }); - }); - }); // kernel + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams, NTeams), + RAJA::Threads(THREAD_SZ, THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + // + // Loop over teams + // + RAJA::tile( + ctx, THREAD_SZ, row_range, [&](RAJA::RangeSegment const& y_tile) { + RAJA::tile( + ctx, + THREAD_SZ, + col_range, + [&](RAJA::RangeSegment const& x_tile) { + RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; + RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; + RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Cs[ty][tx] = 0.0; + }); + }); + + RAJA::tile( + ctx, + THREAD_SZ, + dot_range, + [&](RAJA::RangeSegment const& k_tile) { + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, k_tile, [&](int k_id, int tx) { + As[ty][tx] = Aview(row, k_id); + }); + }); + + RAJA::loop_icount( + ctx, k_tile, [&](int k_id, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Bs[ty][tx] = Bview(k_id, col); + }); + }); + + ctx.teamSync(); + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, k_tile, [&](int gid, int e) { + Cs[ty][tx] += As[ty][e] * Bs[e][tx]; + }); + }); + }); + + ctx.teamSync(); + }); // slide across matrix + + RAJA::loop_icount( + ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, x_tile, [&](int col, int tx) { + Cview(col, row) = Cs[ty][tx]; + }); + }); + }); + }); + }); // kernel checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. 
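  // For the sizes used in this example (N = 1000, THREAD_SZ = 16) this gives
  // griddim = (63, 63) and blockdim = (16, 16), since ceil(1000 / 16) = 63;
  // the row/col bounds check inside matMultKernel masks the threads of the
  // last block that fall outside the matrix.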
matMultKernel<<>>(N, C, A, B); @@ -697,20 +720,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Cview, N); - std::cout << "\n Running CUDA tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout << "\n Running CUDA tiled mat-mult with shared memory (no " + "RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); sharedMatMultKernel<<>>(N, C, A, B); cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -719,47 +743,58 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define thread block dimensions dim3 blockdim(THREAD_SZ, THREAD_SZ); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL( + (matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); - std::cout << "\n Running HIP tiled mat-mult with shared memory (no RAJA)...\n"; + std::cout << "\n Running HIP tiled mat-mult with shared memory (no " + "RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((sharedMatMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL((sharedMatMultKernel), + dim3(griddim), + dim3(blockdim), + 0, + 0, + N, + d_C, + d_A, + d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -776,16 +811,22 @@ template void checkResult(T* C, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( C(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(C(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -794,16 +835,22 @@ template void checkResult(RAJA::View> Cview, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( Cview(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(Cview(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -815,10 +862,12 @@ template void printResult(T* C, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << C(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << C(row, col) + << std::endl; } } std::cout << std::endl; @@ -828,10 +877,12 @@ template void printResult(RAJA::View> Cview, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << Cview(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << Cview(row, col) + << std::endl; } } std::cout << std::endl; diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index 24e313e649..c5de9b0e30 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -45,15 +45,17 @@ using device_loop = RAJA::hip_global_thread_x; using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -66,11 +68,13 @@ using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions " + "device"); } // @@ -79,39 +83,51 @@ int main(int argc, char *argv[]) // Example usage ./launch_reductions host or ./launch_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Launch reductions example on the host \n"); } - if(exec_space.compare("device") == 
0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Launch reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Launch reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Launch reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -119,70 +135,69 @@ int main(int argc, char *argv[]) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); - - RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(GRID_SZ), - RAJA::Threads(TEAM_SZ)), - "Launch Reductions", - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) - { - - RAJA::loop(ctx, arange, [&] (int i) { - - kernel_sum += a[i]; - - kernel_min.min(a[i]); - kernel_max.max(a[i]); - - kernel_minloc.minloc(a[i], i); - kernel_maxloc.maxloc(a[i], i); - }); - - }); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); + + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), + "Launch Reductions", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&](int i) { + kernel_sum += a[i]; + + kernel_min.min(a[i]); + kernel_max.max(a[i]); + + 
kernel_minloc.minloc(a[i], i); + kernel_maxloc.maxloc(a[i], i); + }); + }); std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp index 62d3d6e3e7..6f68615a45 100644 --- a/examples/memoryManager.hpp +++ b/examples/memoryManager.hpp @@ -28,20 +28,20 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); #else ptr = new T[size]; #endif @@ -49,9 +49,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) @@ -65,37 +66,39 @@ void deallocate(T *&ptr) } } -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #elif defined(RAJA_ENABLE_SYCL) sycl_res->deallocate(ptr); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/examples/multiview.cpp b/examples/multiview.cpp index b765dc84d4..378abde700 100644 --- a/examples/multiview.cpp +++ 
b/examples/multiview.cpp @@ -15,12 +15,12 @@ * A RAJA::MultiView object wraps an array-of-pointers, * or a pointer-to-pointers, whereas a RAJA::View wraps a single * pointer or array. This allows a single RAJA::Layout to be applied to - * multiple arrays internal to the MultiView, allowing multiple arrays to share indexing - * arithmetic when their access patterns are the same. - * + * multiple arrays internal to the MultiView, allowing multiple arrays to share + * indexing arithmetic when their access patterns are the same. + * * The instantiation of a MultiView works exactly like a standard View, - * except that it takes an array-of-pointers. In the following example, a MultiView - * applies a 1-D layout of length 4 to 2 internal arrays in myarr: + * except that it takes an array-of-pointers. In the following example, a + * MultiView applies a 1-D layout of length 4 to 2 internal arrays in myarr: * * // Arrays of the same size, which will become internal to the MultiView. * int a1[4] = {5,6,7,8}; @@ -31,30 +31,35 @@ * myarr[0] = a1; * myarr[1] = a2; * - * // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - * RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); - * - * The default MultiView accesses internal arrays via the 0th index of the MultiView: - * - * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView( 1, 2 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * + * // This MultiView applies a 1-D layout of length 4 to each internal array + * in myarr. RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + * + * The default MultiView accesses internal arrays via the 0th index of the + * MultiView: + * + * MView( 0, 4 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView( 1, 2 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * * The index into the array-of-pointers can be moved to different - * indices of the MultiView () access operator, rather than the default 0th index. By - * passing a third template parameter to the MultiView constructor, the internal array index - * and the integer indicating which array to access can be reversed: + * indices of the MultiView () access operator, rather than the default 0th + * index. By passing a third template parameter to the MultiView constructor, + * the internal array index and the integer indicating which array to access can + * be reversed: * * // MultiView with array-of-pointers index in 1st position * RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); * - * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - * MView1( 2, 1 ); // accesses 2nd index of the 1st internal array a2, returns value of 10 - * - * As the number of Layout dimensions increases, the index into the array-of-pointers can be - * moved to more distinct locations in the MultiView () access operator. Here is an example - * which compares the accesses of a 2-D layout on a normal RAJA::View with a RAJA::MultiView - * with the array-of-pointers index set to the 2nd position: - * + * MView1( 4, 0 ); // accesses the 4th index of the 0th internal array a1, + * returns value of 8 MView1( 2, 1 ); // accesses 2nd index of the 1st internal + * array a2, returns value of 10 + * + * As the number of Layout dimensions increases, the index into the + * array-of-pointers can be moved to more distinct locations in the MultiView () + * access operator. 
Here is an example which compares the accesses of a 2-D + * layout on a normal RAJA::View with a RAJA::MultiView with the + * array-of-pointers index set to the 2nd position: + * * RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); * * normalView( 2, 1 ); // accesses 3rd index of the a1 array, value = 7 @@ -62,8 +67,9 @@ * // MultiView with array-of-pointers index in 2nd position * RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); * - * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, returns value of 7 (same as normaView(2,1)) - * MView2( 2, 1, 1 ); // accesses the 3rd index of the 1st internal array a2, returns value of 11 + * MView2( 2, 1, 0 ); // accesses the 3rd index of the 0th internal array a1, + * returns value of 7 (same as normaView(2,1)) MView2( 2, 1, 1 ); // accesses + * the 3rd index of the 1st internal array a2, returns value of 11 * * The following code demonstrates 2 aspects of RAJA::MultiView usage: * - Basic usage @@ -75,53 +81,62 @@ void docs_example() // temporaries int t1, t2, t3, t4; - printf( "MultiView Example from RAJA Documentation:\n" ); + printf("MultiView Example from RAJA Documentation:\n"); // _multiview_example_1Dinit_start // Arrays of the same size, which will become internal to the MultiView. - int a1[4] = {5,6,7,8}; - int a2[4] = {9,10,11,12}; + int a1[4] = {5, 6, 7, 8}; + int a2[4] = {9, 10, 11, 12}; // Array-of-pointers which will be passed into MultiView. - int * myarr[2]; + int* myarr[2]; myarr[0] = a1; myarr[1] = a2; - // This MultiView applies a 1-D layout of length 4 to each internal array in myarr. - RAJA::MultiView< int, RAJA::Layout<1> > MView(myarr, 4); + // This MultiView applies a 1-D layout of length 4 to each internal array in + // myarr. + RAJA::MultiView> MView(myarr, 4); // _multiview_example_1Dinit_end // _multiview_example_1Daccess_start - t1 = MView( 0, 3 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t2 = MView( 1, 2 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t1 = MView(0, 3); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t2 = MView(1, 2); // accesses 3rd index of the 1st internal array a2, returns + // value of 11 // _multiview_example_1Daccess_end // _multiview_example_1Daopindex_start // MultiView with array-of-pointers index in 1st position. 
- RAJA::MultiView< int, RAJA::Layout<1>, 1 > MView1(myarr, 4); + RAJA::MultiView, 1> MView1(myarr, 4); - t3 = MView1( 3, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 - t4 = MView1( 2, 1 ); // accesses 3rd index of the 1st internal array a2, returns value of 11 + t3 = MView1(3, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 + t4 = MView1(2, 1); // accesses 3rd index of the 1st internal array a2, returns + // value of 11 // _multiview_example_1Daopindex_end - printf( "Comparison of default MultiView with another MultiView that has the array-of-pointers index in the 1st position of the () accessor:\n" ); - printf( "MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3 ); - printf( "MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4 ); + printf("Comparison of default MultiView with another MultiView that has the " + "array-of-pointers index in the 1st position of the () accessor:\n"); + printf("MView( 0, 3 ) = %i, MView1( 3, 0 ) = %i\n", t1, t3); + printf("MView( 1, 2 ) = %i, MView1( 2, 1 ) = %i\n", t2, t4); // _multiview_example_2Daopindex_start - RAJA::View< int, RAJA::Layout<2> > normalView(a1, 2, 2); + RAJA::View> normalView(a1, 2, 2); - t1 = normalView( 1, 1 ); // accesses 4th index of the a1 array, value = 8 + t1 = normalView(1, 1); // accesses 4th index of the a1 array, value = 8 // MultiView with array-of-pointers index in 2nd position - RAJA::MultiView< int, RAJA::Layout<2>, 2 > MView2(myarr, 2, 2); + RAJA::MultiView, 2> MView2(myarr, 2, 2); - t2 = MView2( 1, 1, 0 ); // accesses the 4th index of the 0th internal array a1, returns value of 8 (same as normalView(1,1)) - t3 = MView2( 0, 0, 1 ); // accesses the 1st index of the 1st internal array a2, returns value of 9 + t2 = MView2(1, 1, 0); // accesses the 4th index of the 0th internal array a1, + // returns value of 8 (same as normalView(1,1)) + t3 = MView2(0, 0, 1); // accesses the 1st index of the 1st internal array a2, + // returns value of 9 // _multiview_example_2Daopindex_end - printf( "Comparison of 2D normal View with 2D MultiView that has the array-of-pointers index in the 2nd position of the () accessor:\n" ); - printf( "normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2 ); + printf("Comparison of 2D normal View with 2D MultiView that has the " + "array-of-pointers index in the 2nd position of the () accessor:\n"); + printf("normalView( 1, 1 ) = %i, MView2( 1, 1, 0 ) = %i\n", t1, t2); } int main() @@ -129,11 +144,11 @@ int main() docs_example(); constexpr int N = 12; - int * myarr[2]; // two 3x4 arrays + int* myarr[2]; // two 3x4 arrays int arr1[N]; int arr2[N]; - for ( int ii = 0; ii < N; ++ii ) + for (int ii = 0; ii < N; ++ii) { arr1[ii] = 100 + ii; arr2[ii] = 200 + ii; @@ -143,39 +158,47 @@ int main() myarr[1] = arr2; // 4x3 layout - std::array perm { {0, 1} }; - RAJA::Layout<2> layout = RAJA::make_permuted_layout( - { {4, 3} }, perm - ); + std::array perm{{0, 1}}; + RAJA::Layout<2> layout = RAJA::make_permuted_layout({{4, 3}}, perm); // Basic MultiView usage // Default usage: no specified array-of-pointers index moving // 0th position is used as the array-of-pointers index - RAJA::MultiView> arrView(myarr, layout); + RAJA::MultiView> arrView(myarr, + layout); // Moved array-of-pointers index MultiView usage // Add an array-of-pointers index specifier constexpr int aopidx = 1; - RAJA::MultiView, aopidx> arrViewMov(myarr, layout); + RAJA::MultiView, aopidx> arrViewMov( + myarr, layout); // Comparing values of both views - printf ( "Comparing values 
of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf("Comparing values of both default and 1-index-ed MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", + pp, + kk, + jj, + arrView(pp, kk, jj), + kk, + pp, + jj, + arrViewMov(kk, pp, jj)); } } } // switch values - printf ( "Switching values\n" ); - for ( int kk = 0; kk < 4; ++kk ) + printf("Switching values\n"); + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { int temp = arrView(0, kk, jj); arrView(0, kk, jj) = arrView(1, kk, jj); @@ -184,14 +207,23 @@ int main() } // Comparing switched values of both views - printf ( "Comparing switched values of both default and 1-index-ed MultiViews:\n" ); - for ( int pp = 0; pp < 2; ++pp ) + printf("Comparing switched values of both default and 1-index-ed " + "MultiViews:\n"); + for (int pp = 0; pp < 2; ++pp) { - for ( int kk = 0; kk < 4; ++kk ) + for (int kk = 0; kk < 4; ++kk) { - for ( int jj = 0; jj < 3; ++jj ) + for (int jj = 0; jj < 3; ++jj) { - printf ( "arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", pp, kk, jj, arrView(pp, kk, jj), kk, pp, jj, arrViewMov(kk, pp, jj) ); + printf("arr(%i, %i, %i) %d == arrmov(%i, %i, %i) %d\n", + pp, + kk, + jj, + arrView(pp, kk, jj), + kk, + pp, + jj, + arrViewMov(kk, pp, jj)); } } } diff --git a/examples/omp-target-kernel.cpp b/examples/omp-target-kernel.cpp index ce425e07a6..1d101f1ca5 100644 --- a/examples/omp-target-kernel.cpp +++ b/examples/omp-target-kernel.cpp @@ -10,35 +10,33 @@ using namespace RAJA; using namespace RAJA::statement; -int main(int /*argc*/, char** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) +{ // using Pol = KernelPolicy< // For<1, RAJA::seq_exec>, // For<0, RAJA::omp_target_parallel_for_exec<1>, Lambda<0> > // >; using Pol = KernelPolicy< - Collapse, Lambda<0> > >; + Collapse, Lambda<0>>>; - double* array = new double[25*25]; + double* array = new double[25 * 25]; -#pragma omp target enter data map(to: array[0:25*25]) +#pragma omp target enter data map(to : array [0:25 * 25]) #pragma omp target data use_device_ptr(array) #if 1 RAJA::kernel( - RAJA::make_tuple( - RAJA::RangeSegment(0,25), - RAJA::RangeSegment(0,25)), - [=] (int /*i*/, int /*j*/) { - //array[i + (25*j)] = i*j; - // int idx = i; - //array[0] = i*j; - }); + RAJA::make_tuple(RAJA::RangeSegment(0, 25), RAJA::RangeSegment(0, 25)), + [=](int /*i*/, int /*j*/) { + // array[i + (25*j)] = i*j; + // int idx = i; + // array[0] = i*j; + }); #else - RAJA::forall>( - RAJA::RangeSegment(0,25), - [=] (int i) { - // - }); + RAJA::forall>(RAJA::RangeSegment(0, 25), + [=](int i) { + // + }); #endif } diff --git a/examples/omp-target-ltimes.cpp b/examples/omp-target-ltimes.cpp index f51694b3af..c04939c26f 100644 --- a/examples/omp-target-ltimes.cpp +++ b/examples/omp-target-ltimes.cpp @@ -9,7 +9,6 @@ #include - #include "RAJA/RAJA.hpp" #include "RAJA/util/Timer.hpp" @@ -28,22 +27,25 @@ RAJA_INDEX_VALUE(IZone, "IZone"); void runLTimesRajaKernel(bool debug, - Index_type num_moments, - Index_type num_directions, - Index_type num_groups, - Index_type num_zones) + Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) { - 
using namespace RAJA::statement; + using namespace RAJA::statement; // psi[direction, group, zone] - using PsiView = RAJA::TypedView, IDirection, IGroup, IZone>; + using PsiView = RAJA:: + TypedView, IDirection, IGroup, IZone>; // phi[moment, group, zone] - using PhiView = RAJA::TypedView, IMoment, IGroup, IZone>; + using PhiView = + RAJA::TypedView, IMoment, IGroup, IZone>; // ell[moment, direction] - using EllView = RAJA::TypedView, IMoment, IDirection>; + using EllView = + RAJA::TypedView, IMoment, IDirection>; // allocate data @@ -54,16 +56,19 @@ void runLTimesRajaKernel(bool debug, // randomize data - for (size_t i = 0; i < ell_data.size(); ++i) { - ell_data[i] = i; //drand48(); + for (size_t i = 0; i < ell_data.size(); ++i) + { + ell_data[i] = i; // drand48(); } - for (size_t i = 0; i < psi_data.size(); ++i) { - psi_data[i] = 2*i; //drand48(); + for (size_t i = 0; i < psi_data.size(); ++i) + { + psi_data[i] = 2 * i; // drand48(); } - for (size_t i = 0; i < phi_data.size(); ++i) { - phi_data[i] = 0; //drand48(); + for (size_t i = 0; i < phi_data.size(); ++i) + { + phi_data[i] = 0; // drand48(); } int hid = omp_get_initial_device(); @@ -71,58 +76,50 @@ void runLTimesRajaKernel(bool debug, // create device memory double *d_ell, *d_phi, *d_psi; - d_ell = static_cast(omp_target_alloc(sizeof(double) * ell_data.size(), did)); - d_phi = static_cast(omp_target_alloc(sizeof(double) * phi_data.size(), did)); - d_psi = static_cast(omp_target_alloc(sizeof(double) * psi_data.size(), did)); + d_ell = static_cast( + omp_target_alloc(sizeof(double) * ell_data.size(), did)); + d_phi = static_cast( + omp_target_alloc(sizeof(double) * phi_data.size(), did)); + d_psi = static_cast( + omp_target_alloc(sizeof(double) * psi_data.size(), did)); // Copy to device omp_target_memcpy( - &ell_data[0], - d_ell, - sizeof(double) * ell_data.size(), - 0,0, hid, did); + &ell_data[0], d_ell, sizeof(double) * ell_data.size(), 0, 0, hid, did); omp_target_memcpy( - &phi_data[0], - d_phi, - sizeof(double) * phi_data.size(), - 0,0,hid,did); + &phi_data[0], d_phi, sizeof(double) * phi_data.size(), 0, 0, hid, did); omp_target_memcpy( - &psi_data[0], - d_psi, - sizeof(double) * psi_data.size(), - 0,0,hid,did); + &psi_data[0], d_psi, sizeof(double) * psi_data.size(), 0, 0, hid, did); // create views on data - std::array ell_perm {{0, 1}}; - EllView ell( - d_ell, - make_permuted_layout({{num_moments, num_directions}}, ell_perm)); + std::array ell_perm{{0, 1}}; + EllView ell(d_ell, + make_permuted_layout({{num_moments, num_directions}}, ell_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi( - d_psi, - make_permuted_layout({{num_directions, num_groups, num_zones}}, psi_perm)); + std::array psi_perm{{0, 1, 2}}; + PsiView psi(d_psi, + make_permuted_layout({{num_directions, num_groups, num_zones}}, + psi_perm)); - std::array phi_perm {{0, 1, 2}}; + std::array phi_perm{{0, 1, 2}}; PhiView phi( d_phi, make_permuted_layout({{num_moments, num_groups, num_zones}}, phi_perm)); - - using Pol = RAJA::KernelPolicy< - Collapse, - For<3, RAJA::seq_exec, Lambda<0>>>>; + using Pol = RAJA::KernelPolicy, + For<3, RAJA::seq_exec, Lambda<0>>>>; RAJA::Timer timer; timer.start(); - auto segments = RAJA::make_tuple(TypedRangeSegment(0, num_moments), - TypedRangeSegment(0, num_directions), - TypedRangeSegment(0, num_groups), - TypedRangeSegment(0, num_zones)); + auto segments = + RAJA::make_tuple(TypedRangeSegment(0, num_moments), + TypedRangeSegment(0, num_directions), + TypedRangeSegment(0, num_groups), + TypedRangeSegment(0, num_zones)); 
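  // Conceptually, the kernel<Pol> call below visits every (m, d, g, z)
  // combination drawn from the four segments above and applies the lambda;
  // an equivalent plain-loop sketch (illustrative only, ignoring the OpenMP
  // target collapse that Pol applies to the outer loops):
  //
  //   for (IMoment m(0); m < num_moments; ++m)
  //     for (IDirection d(0); d < num_directions; ++d)
  //       for (IGroup g(0); g < num_groups; ++g)
  //         for (IZone z(0); z < num_zones; ++z)
  //           phi(m, g, z) += ell(m, d) * psi(d, g, z);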
kernel( @@ -130,56 +127,62 @@ void runLTimesRajaKernel(bool debug, segments, // Lambda_CalcPhi - [=] (IMoment m, IDirection d, IGroup g, IZone z) { + [=](IMoment m, IDirection d, IGroup g, IZone z) { phi(m, g, z) += ell(m, d) * psi(d, g, z); }); - timer.stop(); - printf("LTimes took %lf seconds using RAJA::kernel\n", - timer.elapsed()); + printf("LTimes took %lf seconds using RAJA::kernel\n", timer.elapsed()); // Check correctness - if(debug){ + if (debug) + { size_t errors = 0; double total_error = 0.; - for (IZone z(0); z < num_zones; ++z) { - for (IGroup g(0); g < num_groups; ++g) { - for (IMoment m(0); m < num_moments; ++m) { + for (IZone z(0); z < num_zones; ++z) + { + for (IGroup g(0); g < num_groups; ++g) + { + for (IMoment m(0); m < num_moments; ++m) + { double total = 0.0; - for (IDirection d(0); d < num_directions; ++d) { + for (IDirection d(0); d < num_directions; ++d) + { double val = ell(m, d) * psi(d, g, z); total += val; } - if(std::abs(total-phi(m,g,z)) > 1e-9){ - ++ errors; + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + ++errors; } - total_error += std::abs(total-phi(m,g,z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if(errors == 0){ + if (errors == 0) + { printf(" -- no errors (%e)\n", total_error); } - else{ + else + { printf(" -- failed : %ld errors\n", (long)errors); } } - } -int main(){ +int main() +{ bool debug = true; int m = 25; int d = 80; int g = 32; - int z = 32*1024; + int z = 32 * 1024; printf("m=%d, d=%d, g=%d, z=%d\n", m, d, g, z); @@ -187,5 +190,3 @@ int main(){ return 0; } - - diff --git a/examples/pi-reduce_vs_atomic.cpp b/examples/pi-reduce_vs_atomic.cpp index ea0c18611f..fae7c47c19 100644 --- a/examples/pi-reduce_vs_atomic.cpp +++ b/examples/pi-reduce_vs_atomic.cpp @@ -45,55 +45,54 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA pi example...\n"; -// -// Define RangeSegment to enumerate "bins" and "bin step" size used in -// Riemann integral sum to approximate pi, -// and memory location for atomic add operation. -// + // + // Define RangeSegment to enumerate "bins" and "bin step" size used in + // Riemann integral sum to approximate pi, + // and memory location for atomic add operation. 
+ // const int num_bins = 512 * 512; - const double dx = 1.0 / double(num_bins); + const double dx = 1.0 / double(num_bins); - RAJA::RangeSegment bins(0, num_bins); + RAJA::RangeSegment bins(0, num_bins); double* atomic_pi = memoryManager::allocate(1); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < num_bins; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < num_bins; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation (reduction)...\n"; - using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using EXEC_POL1 = RAJA::seq_exec; + using REDUCE_POL1 = RAJA::seq_reduce; RAJA::ReduceSum seq_pi(0.0); RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; std::cout << "\n Running RAJA sequential pi approximation (atomic)...\n"; @@ -103,35 +102,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, - dx / (1.0 + x * x)); + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP pi approximation (reduction)...\n"; - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; RAJA::ReduceSum omp_pi(0.0); RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; std::cout << "\n Running RAJA OpenMP pi approximation (atomic)...\n"; @@ -141,37 +137,34 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; RAJA::forall(bins, [=](int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, - dx / (1.0 + x * x)); + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); 
*atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA pi approximation (reduction)...\n"; - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; std::cout << "\n Running RAJA CUDA pi approximation (atomic)...\n"; @@ -180,63 +173,62 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) *atomic_pi = 0.0; - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(atomic_pi, dx / (1.0 + x * x)); }); *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP pi approximation (reduction)...\n"; - using EXEC_POL4 = RAJA::hip_exec; + using EXEC_POL4 = RAJA::hip_exec; using REDUCE_POL4 = RAJA::hip_reduce; RAJA::ReduceSum hip_pi(0.0); - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - hip_pi += dx / (1.0 + x * x); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + hip_pi += dx / (1.0 + x * x); }); double hip_pi_val = hip_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << hip_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << hip_pi_val << std::endl; std::cout << "\n Running RAJA HIP pi approximation (atomic)...\n"; *atomic_pi = 0; double* d_atomic_pi = memoryManager::allocate_gpu(1); - hipErrchk(hipMemcpy( d_atomic_pi, atomic_pi, 1 * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( + d_atomic_pi, atomic_pi, 1 * sizeof(double), hipMemcpyHostToDevice)); using ATOMIC_POL4 = RAJA::hip_atomic; - RAJA::forall(bins, [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); + RAJA::forall(bins, [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(d_atomic_pi, dx / (1.0 + x * x)); }); - hipErrchk(hipMemcpy( atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost )); - *atomic_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << *atomic_pi << std::endl; + hipErrchk(hipMemcpy( + atomic_pi, d_atomic_pi, 1 * sizeof(double), hipMemcpyDeviceToHost)); + *atomic_pi *= 4.0; + std::cout << "\tpi = " << std::setprecision(prec) << *atomic_pi << std::endl; 
memoryManager::deallocate_gpu(d_atomic_pi); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(atomic_pi); std::cout << "\n DONE!...\n"; diff --git a/examples/plugin/counter-plugin.cpp b/examples/plugin/counter-plugin.cpp index 8134cd9b83..ece7814a71 100644 --- a/examples/plugin/counter-plugin.cpp +++ b/examples/plugin/counter-plugin.cpp @@ -10,45 +10,51 @@ #include -class CounterPlugin : - public RAJA::util::PluginStrategy +class CounterPlugin : public RAJA::util::PluginStrategy { - public: - void preCapture(const RAJA::util::PluginContext& p) override { - if (p.platform == RAJA::Platform::host) +public: + void preCapture(const RAJA::util::PluginContext& p) override + { + if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Capturing host kernel for the " << ++host_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing host kernel for the " + << ++host_capture_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Capturing device kernel for the " << ++device_capture_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Capturing device kernel for the " + << ++device_capture_counter << " time!" << std::endl; } } - void preLaunch(const RAJA::util::PluginContext& p) override { + void preLaunch(const RAJA::util::PluginContext& p) override + { if (p.platform == RAJA::Platform::host) { - std::cout << " [CounterPlugin]: Launching host kernel for the " << ++host_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching host kernel for the " + << ++host_launch_counter << " time!" << std::endl; } else { - std::cout << " [CounterPlugin]: Launching device kernel for the " << ++device_launch_counter << " time!" << std::endl; + std::cout << " [CounterPlugin]: Launching device kernel for the " + << ++device_launch_counter << " time!" << std::endl; } } - private: - int host_capture_counter; - int device_capture_counter; - int host_launch_counter; - int device_launch_counter; +private: + int host_capture_counter; + int device_capture_counter; + int host_launch_counter; + int device_launch_counter; }; // Statically loading plugin. -static RAJA::util::PluginRegistry::add P("Counter", "Counts number of kernel launches."); +static RAJA::util::PluginRegistry::add P("Counter", + "Counts " + "number of " + "kernel " + "launches."); // Dynamically loading plugin. 
-extern "C" RAJA::util::PluginStrategy *getPlugin () -{ - return new CounterPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new CounterPlugin; } // _plugin_example_end diff --git a/examples/plugin/test-plugin-dynamic.cpp b/examples/plugin/test-plugin-dynamic.cpp index c9e574a99e..b73a13441f 100644 --- a/examples/plugin/test-plugin-dynamic.cpp +++ b/examples/plugin/test-plugin-dynamic.cpp @@ -8,15 +8,14 @@ #include "RAJA/RAJA.hpp" #include -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { RAJA::util::init_plugins("../lib/libtimer_plugin.so"); - double *a = new double[10]; + double* a = new double[10]; for (int i = 0; i < 4; i++) { - RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { - a[i] = 0; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/test-plugin.cpp b/examples/plugin/test-plugin.cpp index b18233cb90..2164ae7df9 100644 --- a/examples/plugin/test-plugin.cpp +++ b/examples/plugin/test-plugin.cpp @@ -7,13 +7,13 @@ #include "RAJA/RAJA.hpp" -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { double* a = new double[10]; - for (int i = 0; i < 10; i++) { - RAJA::forall(RAJA::RangeSegment(0,10), [=] (int i) { - a[i] = 0; - }); + for (int i = 0; i < 10; i++) + { + RAJA::forall(RAJA::RangeSegment(0, 10), + [=](int i) { a[i] = 0; }); } } diff --git a/examples/plugin/timer-plugin.cpp b/examples/plugin/timer-plugin.cpp index 2619f9fcd9..cdb330a970 100644 --- a/examples/plugin/timer-plugin.cpp +++ b/examples/plugin/timer-plugin.cpp @@ -21,15 +21,19 @@ class TimerPlugin : public RAJA::util::PluginStrategy void postLaunch(const RAJA::util::PluginContext& p) override { end_time = std::chrono::steady_clock::now(); - double elapsedMs = std::chrono::duration(end_time - start_time).count(); + double elapsedMs = + std::chrono::duration(end_time - start_time) + .count(); if (p.platform == RAJA::Platform::host) { - printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of host kernel was %f ms\n", + elapsedMs); } else { - printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", elapsedMs); + printf("[TimerPlugin]: Elapsed time of device kernel was %f ms\n", + elapsedMs); } } @@ -39,10 +43,10 @@ class TimerPlugin : public RAJA::util::PluginStrategy }; // Dynamically loading plugin. -extern "C" RAJA::util::PluginStrategy *getPlugin() -{ - return new TimerPlugin; -} +extern "C" RAJA::util::PluginStrategy* getPlugin() { return new TimerPlugin; } // Statically loading plugin. 
-static RAJA::util::PluginRegistry::add P("Timer", "Prints elapsed time of kernel executions."); \ No newline at end of file +static RAJA::util::PluginRegistry::add P("Timer", + "Prints elapsed " + "time of kernel " + "executions."); \ No newline at end of file diff --git a/examples/raja-launch.cpp b/examples/raja-launch.cpp index b2642e16ff..89ed356f37 100644 --- a/examples/raja-launch.cpp +++ b/examples/raja-launch.cpp @@ -56,36 +56,36 @@ using launch_policy = RAJA::LaunchPolicy< */ using teams_x = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::omp_parallel_for_exec + RAJA::omp_parallel_for_exec #else - RAJA::seq_exec + RAJA::seq_exec #endif #if defined(RAJA_ENABLE_CUDA) - , - RAJA::cuda_block_x_direct + , + RAJA::cuda_block_x_direct #endif #if defined(RAJA_ENABLE_HIP) - , - RAJA::hip_block_x_direct + , + RAJA::hip_block_x_direct #endif - >; + >; /* * Define thread policies. * Up to 3 dimension are supported: x,y,z */ using threads_x = RAJA::LoopPolicy; + >; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // Resource object for host @@ -109,7 +109,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA teams may switch between host and device policies at run time. // The loop below will execute through the available backends. - for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) + { auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; @@ -117,12 +118,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int N_tri = 5; int* Ddat = nullptr; - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { Ddat = host_res.allocate(N_tri * N_tri); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { Ddat = device_res.allocate(N_tri * N_tri); } #endif @@ -141,51 +144,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * and is used to perform thread synchronizations within a team. 
*/ - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ - std::cout << "\n Running upper triangular pattern example on the host...\n"; - } else { - std::cout << "\n Running upper triangular pattern example on the device...\n"; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { + std::cout << "\n Running upper triangular pattern example on the " + "host...\n"; + } + else + { + std::cout << "\n Running upper triangular pattern example on the " + "device...\n"; } RAJA::View> D(Ddat, N_tri, N_tri); - RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; - RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { - - // Array shared within threads of the same team - RAJA_TEAM_SHARED int s_A[1]; - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { s_A[c] = r; - }); // loop c - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { - D(r, c) = r * N_tri + c; - printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); - }); // loop c + }); // loop c - }); // loop r + ctx.teamSync(); - }); // outer lambda + RAJA::loop( + ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf( + "r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + }); // loop r + }); // outer lambda - if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { host_res.deallocate(Ddat); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + { device_res.deallocate(Ddat); } #endif - } // Execution places loop + } // Execution places loop -} // Main +} // Main diff --git a/examples/red-black-gauss-seidel.cpp b/examples/red-black-gauss-seidel.cpp index cfe74dc58a..f22df3cac9 100644 --- a/examples/red-black-gauss-seidel.cpp +++ b/examples/red-black-gauss-seidel.cpp @@ -52,7 +52,8 @@ * h - Spacing between grid points * n - Number of grid points */ -struct grid_s { +struct grid_s +{ double o, h; int n; }; @@ -62,16 +63,16 @@ struct grid_s { * solution - Function for the analytic solution * computeErr - Displays the maximum error in the solution * gsColorPolicy - Generates the custom index set for this example -*/ + */ double solution(double x, double y); -void computeErr(double *I, grid_s grid); -RAJA::TypedIndexSet - gsColorPolicy(int N, camp::resources::Resource res); +void computeErr(double* I, grid_s grid); +RAJA::TypedIndexSet +gsColorPolicy(int N, camp::resources::Resource res); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Red-Black Gauss-Seidel Example"<(NN); + double* I = resource.allocate(NN); memset(I, 0, NN * sizeof(double)); @@ -117,7 +118,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) resI2 = 1; iteration = 0; - while (resI2 > tol * tol) { + while (resI2 > tol * tol) + { #if defined(RAJA_ENABLE_OPENMP) RAJA::ReduceSum RAJA_resI2(0.0); @@ -128,9 +130,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) // // Gauss-Seidel Iteration // - RAJA::forall(colorSet, - [=](RAJA::Index_type id) { - + RAJA::forall(colorSet, [=](RAJA::Index_type id) { // // Compute x,y grid index // @@ -140,21 +140,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double x = gridx.o + m * gridx.h; double y = gridx.o + n * gridx.h; - double f = gridx.h * gridx.h * + double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); - double newI = -0.25 * (f - I[id - N - 2] - I[id + N + 2] - - I[id - 1] - I[id + 1]); + double newI = + -0.25 * (f - I[id - N - 2] - I[id + N + 2] - I[id - 1] - I[id + 1]); double oldI = I[id]; RAJA_resI2 += (newI - oldI) * (newI - oldI); I[id] = newI; - }); resI2 = RAJA_resI2; - if (iteration > maxIter) { - std::cout<<"Gauss-Seidel maxed out on iterations"< maxIter) + { + std::cout << "Gauss-Seidel maxed out on iterations" << std::endl; break; } @@ -174,36 +174,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to generate RAJA ListSegments and populate a RAJA Static Index // Set. -RAJA::TypedIndexSet - gsColorPolicy(int N, camp::resources::Resource res) +RAJA::TypedIndexSet +gsColorPolicy(int N, camp::resources::Resource res) { RAJA::TypedIndexSet colorSet; - int redN = static_cast( std::ceil( static_cast(N * N / 2) ) ); - int blkN = static_cast( std::floor( static_cast(N * N / 2) ) ); - RAJA::Index_type *Red = new RAJA::Index_type[redN]; - RAJA::Index_type *Blk = new RAJA::Index_type[blkN]; + int redN = static_cast(std::ceil(static_cast(N * N / 2))); + int blkN = static_cast(std::floor(static_cast(N * N / 2))); + RAJA::Index_type* Red = new RAJA::Index_type[redN]; + RAJA::Index_type* Blk = new RAJA::Index_type[blkN]; int ib = 0; int ir = 0; bool isRed = true; - for (int n = 1; n <= N; ++n) { - - for (int m = 1; m <= N; ++m) { - + for (int n = 1; n <= N; ++n) + { + + for (int m = 1; m <= N; ++m) + { + RAJA::Index_type id = n * (N + 2) + m; - if (isRed) { + if (isRed) + { Red[ib] = id; ib++; - } else { + } + else + { Blk[ir] = id; ir++; } isRed = !isRed; } - } // Create Index @@ -227,26 +231,25 @@ double solution(double x, double y) // // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} // -void computeErr(double *I, grid_s grid) +void computeErr(double* I, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.n); RAJA::ReduceMax tMax(-1.0); - using errPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + grid.n * ty; - double x = grid.o + tx * grid.h; - double y = grid.o + ty * grid.h; - double myErr = std::abs(I[id] - solution(x, y)); - tMax.max(myErr); - - }); + using errPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + grid.n * ty; + double x = grid.o + tx * grid.h; + double y = grid.o + ty * grid.h; + double myErr = std::abs(I[id] - solution(x, y)); + tMax.max(myErr); + }); double l2err = tMax; printf("Max error = %lg, h = %f \n", l2err, grid.h); diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 0b35017fac..c4ef22f542 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -28,25 +28,29 @@ void checkResult(int* res, int len); void printResult(int* res, int len); -using 
policy_list = camp::list - ,RAJA::cuda_exec<512> + , + RAJA::cuda_exec<256>, + RAJA::cuda_exec<512> #endif #if defined(RAJA_ENABLE_HIP) - ,RAJA::hip_exec<256> - ,RAJA::hip_exec<512> + , + RAJA::hip_exec<256>, + RAJA::hip_exec<512> #endif >; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index of the policy to run"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index " + "of the policy to run"); } // @@ -58,50 +62,55 @@ int main(int argc, char *argv[]) const int pol = std::stoi(argv[1]); RAJA::ExecPlace select_cpu_or_gpu; - if(pol < 2) { + if (pol < 2) + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; - } else { + } + else + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; } std::cout << "\n\nRAJA vector addition example...\n"; - std::cout << "Using policy # "<(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = i; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } // _cstyle_vector_add_end checkResult(c, N); -//printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Example of dynamic policy selection for forall + //----------------------------------------------------------------------------// RAJA::resources::Host host_res; #if defined(RAJA_ENABLE_CUDA) @@ -112,30 +121,32 @@ int main(int argc, char *argv[]) #endif #if defined(RAJA_ENABLE_SYCL) RAJA::resources::Sycl device_res; -#endif +#endif - //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + // Get typed erased resource - it will internally store if we are running on + // the host or device +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - RAJA::expt::dynamic_forall - (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { - - c[i] = a[i] + b[i]; - - }); + RAJA::expt::dynamic_forall( + res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + c[i] = a[i] + b[i]; + }); checkResult(c, N); - //printResult(c, N); + // printResult(c, N); -//----------------------------------------------------------------------------// -// -// Clean up. 
-// + //----------------------------------------------------------------------------// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -151,12 +162,19 @@ int main(int argc, char *argv[]) void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != 0) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -167,7 +185,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index b374bdba3f..8d729c4368 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -18,7 +18,7 @@ * Vector Addition Example * * Computes c = a + b, where a, b, c are vectors of ints. - * It illustrates similarities between a C-style for-loop and a RAJA + * It illustrates similarities between a C-style for-loop and a RAJA * forall loop. * * RAJA features shown: @@ -35,279 +35,275 @@ // // Functions for checking and printing results // -void checkResult(int* res, int len); +void checkResult(int* res, int len); void printResult(int* res, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA vector addition example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 100000; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // RAJA::resources::Host host{}; - int *a = host.allocate(N); - int *b = host.allocate(N); - int *c = host.allocate(N); + int* a = host.allocate(N); + int* b = host.allocate(N); + int* c = host.allocate(N); - int *a_ = host.allocate(N); - int *b_ = host.allocate(N); - int *c_ = host.allocate(N); + int* a_ = host.allocate(N); + int* b_ = host.allocate(N); + int* c_ = host.allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = -i; b[i] = 2 * i; a_[i] = -i; b_[i] = 2 * i; - } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style vector addition...\n"; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces sequential execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces sequential execution.... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::sind_exec policy enforces simd execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sind_exec policy enforces simd execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA simd_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// RAJA::omp_for_parallel_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_for_parallel_exec policy execution.... + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA omp_parallel_for_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_static_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_static_exec policy execution.... + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; + std::cout << "\n Running RAJA omp_parallel_for_static_exec (default " + "chunksize) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall>( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_dynamic_exec policy execution.... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_dynamic_exec policy execution.... 
+ //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; + std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector " + "addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), - [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall>( + host, RAJA::RangeSegment(0, N), [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, N); #endif +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - -/* - GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block -*/ -const int GPU_BLOCK_SIZE = 256; + /* + GPU_BLOCK_SIZE - specifies the number of threads in a CUDA/HIP thread block + */ + const int GPU_BLOCK_SIZE = 256; -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy execution.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector addition on 2 seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy execution.... + //----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA GPU vector addition on 2 seperate " + "streams...\n"; #if defined(RAJA_ENABLE_CUDA) - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - using EXEC_POLICY = RAJA::cuda_exec_async; + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + using EXEC_POLICY = RAJA::cuda_exec_async; #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - using EXEC_POLICY = RAJA::hip_exec_async; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) -RAJA::resources::Sycl res_gpu1; -RAJA::resources::Sycl res_gpu2; -using EXEC_POLICY = RAJA::sycl_exec; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + using EXEC_POLICY = RAJA::sycl_exec; #endif - int* d_a1 = res_gpu1.allocate(N); - int* d_b1 = res_gpu1.allocate(N); - int* d_c1 = res_gpu1.allocate(N); + int* d_a1 = res_gpu1.allocate(N); + int* d_b1 = res_gpu1.allocate(N); + int* d_c1 = res_gpu1.allocate(N); - int* d_a2 = res_gpu2.allocate(N); - int* d_b2 = res_gpu2.allocate(N); - int* d_c2 = res_gpu2.allocate(N); + int* d_a2 = res_gpu2.allocate(N); + int* d_b2 = res_gpu2.allocate(N); + int* d_c2 = res_gpu2.allocate(N); - res_gpu1.memcpy(d_a1, a, sizeof(int)* N); - res_gpu1.memcpy(d_b1, b, sizeof(int)* N); + res_gpu1.memcpy(d_a1, a, sizeof(int) * N); + res_gpu1.memcpy(d_b1, b, sizeof(int) * N); - res_gpu2.memcpy(d_a2, a, sizeof(int)* N); - res_gpu2.memcpy(d_b2, b, sizeof(int)* N); + res_gpu2.memcpy(d_a2, a, sizeof(int) * N); + res_gpu2.memcpy(d_b2, b, sizeof(int) * N); - RAJA::forall(res_gpu1, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c1[i] = d_a1[i] + d_b1[i]; - }); + RAJA::forall( + res_gpu1, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + d_c1[i] = d_a1[i] + d_b1[i]; + }); - RAJA::forall(res_gpu2, RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c2[i] = d_a2[i] + d_b2[i]; - }); + RAJA::forall( + res_gpu2, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + d_c2[i] = d_a2[i] + d_b2[i]; + }); - res_gpu1.memcpy(c, d_c1, sizeof(int)*N ); + 
res_gpu1.memcpy(c, d_c1, sizeof(int) * N); - res_gpu2.memcpy(c_, d_c2, sizeof(int)*N ); + res_gpu2.memcpy(c_, d_c2, sizeof(int) * N); - checkResult(c, N); - checkResult(c_, N); + checkResult(c, N); + checkResult(c_, N); - res_gpu1.deallocate(d_a1); - res_gpu1.deallocate(d_b1); - res_gpu1.deallocate(d_c1); + res_gpu1.deallocate(d_a1); + res_gpu1.deallocate(d_b1); + res_gpu1.deallocate(d_c1); - res_gpu2.deallocate(d_a2); - res_gpu2.deallocate(d_b2); - res_gpu2.deallocate(d_c2); -} + res_gpu2.deallocate(d_a2); + res_gpu2.deallocate(d_b2); + res_gpu2.deallocate(d_c2); + } -//----------------------------------------------------------------------------// -// RAJA::cuda/hip_exec policy with waiting event.... -//----------------------------------------------------------------------------// -{ - std::cout << "\n Running RAJA GPU vector with dependency between two seperate streams...\n"; + //----------------------------------------------------------------------------// + // RAJA::cuda/hip_exec policy with waiting event.... + //----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA GPU vector with dependency between two " + "seperate streams...\n"; #if defined(RAJA_ENABLE_CUDA) - // _raja_res_defres_start - RAJA::resources::Cuda res_gpu1; - RAJA::resources::Cuda res_gpu2; - RAJA::resources::Host res_host; + // _raja_res_defres_start + RAJA::resources::Cuda res_gpu1; + RAJA::resources::Cuda res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::cuda_exec_async; - // _raja_res_defres_end + using EXEC_POLICY = RAJA::cuda_exec_async; + // _raja_res_defres_end #elif defined(RAJA_ENABLE_HIP) - RAJA::resources::Hip res_gpu1; - RAJA::resources::Hip res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Hip res_gpu1; + RAJA::resources::Hip res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::hip_exec_async; + using EXEC_POLICY = RAJA::hip_exec_async; #elif defined(RAJA_ENABLE_SYCL) - RAJA::resources::Sycl res_gpu1; - RAJA::resources::Sycl res_gpu2; - RAJA::resources::Host res_host; + RAJA::resources::Sycl res_gpu1; + RAJA::resources::Sycl res_gpu2; + RAJA::resources::Host res_host; - using EXEC_POLICY = RAJA::sycl_exec; + using EXEC_POLICY = RAJA::sycl_exec; #endif - // _raja_res_alloc_start - int* d_array1 = res_gpu1.allocate(N); - int* d_array2 = res_gpu2.allocate(N); - int* h_array = res_host.allocate(N); - // _raja_res_alloc_end - - // _raja_res_k1_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] = i; - } - ); - // _raja_res_k1_end - - // _raja_res_k2_start - RAJA::resources::Event e = RAJA::forall(res_gpu2, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array2[i] = -1; - } - ); - // _raja_res_k2_end - - // _raja_res_wait_start - res_gpu2.wait_for(&e); - // _raja_res_wait_end - - // _raja_res_k3_start - RAJA::forall(res_gpu1, RAJA::RangeSegment(0,N), - [=] RAJA_HOST_DEVICE (int i) { - d_array1[i] *= d_array2[i]; - } - ); - // _raja_res_k3_end - - // _raja_res_memcpy_start - res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); - // _raja_res_memcpy_end - - // _raja_res_k4_start - bool check = true; - RAJA::forall(res_host, RAJA::RangeSegment(0,N), - [&check, h_array] (int i) { - if(h_array[i] != -i) {check = false;} - } - ); - // _raja_res_k4_end - - std::cout << "\n result -- "; - if (check) std::cout << "PASS\n"; - else std::cout << "FAIL\n"; - - res_gpu1.deallocate(d_array1); - res_gpu2.deallocate(d_array2); - 
res_host.deallocate(h_array); - -} + // _raja_res_alloc_start + int* d_array1 = res_gpu1.allocate(N); + int* d_array2 = res_gpu2.allocate(N); + int* h_array = res_host.allocate(N); + // _raja_res_alloc_end + + // _raja_res_k1_start + RAJA::forall(res_gpu1, + RAJA::RangeSegment(0, N), + [=] RAJA_HOST_DEVICE(int i) { d_array1[i] = i; }); + // _raja_res_k1_end + + // _raja_res_k2_start + RAJA::resources::Event e = RAJA::forall( + res_gpu2, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + d_array2[i] = -1; + }); + // _raja_res_k2_end + + // _raja_res_wait_start + res_gpu2.wait_for(&e); + // _raja_res_wait_end + + // _raja_res_k3_start + RAJA::forall( + res_gpu1, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE(int i) { + d_array1[i] *= d_array2[i]; + }); + // _raja_res_k3_end + + // _raja_res_memcpy_start + res_gpu1.memcpy(h_array, d_array1, sizeof(int) * N); + // _raja_res_memcpy_end + + // _raja_res_k4_start + bool check = true; + RAJA::forall( + res_host, RAJA::RangeSegment(0, N), [&check, h_array](int i) { + if (h_array[i] != -i) + { + check = false; + } + }); + // _raja_res_k4_end + + std::cout << "\n result -- "; + if (check) + std::cout << "PASS\n"; + else + std::cout << "FAIL\n"; + + res_gpu1.deallocate(d_array1); + res_gpu2.deallocate(d_array2); + res_host.deallocate(h_array); + } #endif -// -// -// Clean up. -// + // + // + // Clean up. + // host.deallocate(a); host.deallocate(b); host.deallocate(c); @@ -324,15 +320,22 @@ using EXEC_POLICY = RAJA::sycl_exec; // // Function to check result and report P/F. // -void checkResult(int* res, int len) +void checkResult(int* res, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != i ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (res[i] != i) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -343,7 +346,8 @@ void checkResult(int* res, int len) void printResult(int* res, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << res[i] << std::endl; } std::cout << std::endl; diff --git a/examples/resource-kernel.cpp b/examples/resource-kernel.cpp index a754876479..a38f5c83a1 100644 --- a/examples/resource-kernel.cpp +++ b/examples/resource-kernel.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -21,54 +21,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); RAJA::RangeSegment n_range(0, N); - using TEST_POL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::For<0, cuda_block_x_loop, - statement::For<1, cuda_thread_x_loop, - statement::Lambda<0> - > - > - > - >; + using TEST_POL = RAJA::KernelPolicy>>>>; - RAJA::forall(def_host_res, n_range, - [=, &def_cuda_res](int i){ - RAJA::resources::Cuda res_cuda; + RAJA::forall( + def_host_res, n_range, [=, &def_cuda_res](int i) { + 
RAJA::resources::Cuda res_cuda; - RAJA::resources::Event e = RAJA::kernel_resource( - RAJA::make_tuple(one_range, - m_range), + RAJA::resources::Event e = RAJA::kernel_resource( + RAJA::make_tuple(one_range, m_range), - res_cuda, + res_cuda, - [=] RAJA_DEVICE (int k, int j) { - d_array[i*M + j] = i * M + j; - } - ); + [=] RAJA_DEVICE(int k, int j) { d_array[i * M + j] = i * M + j; }); - def_cuda_res.wait_for(&e); - } - ); + def_cuda_res.wait_for(&e); + }); def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; - if (ec_count > 0) + if (ec_count > 0) std::cout << "FAIL : error count = " << ec_count << "\n"; else std::cout << "PASS!\n"; diff --git a/examples/resource-launch.cpp b/examples/resource-launch.cpp index 288b70f8a5..12c228e91e 100644 --- a/examples/resource-launch.cpp +++ b/examples/resource-launch.cpp @@ -10,7 +10,7 @@ using namespace RAJA; -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) @@ -21,8 +21,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; - int* d_array = def_cuda_res.allocate(N*M); - int* h_array = def_host_res.allocate(N*M); + int* d_array = def_cuda_res.allocate(N * M); + int* h_array = def_host_res.allocate(N * M); RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); @@ -34,39 +34,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using threads_x = RAJA::LoopPolicy; - RAJA::forall(def_host_res, n_range, - [=, &def_cuda_res](int i){ - - RAJA::resources::Cuda res_cuda; - - RAJA::resources::Event e = - RAJA::launch(res_cuda, - RAJA::LaunchParams(RAJA::Teams(64), - RAJA::Threads(1)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, m_range, [&] (int j) { - RAJA::loop(ctx, one_range, [&] (int k) { - - d_array[i*M + j] = i * M + j; - - }); - }); - + RAJA::forall( + def_host_res, n_range, [=, &def_cuda_res](int i) { + RAJA::resources::Cuda res_cuda; + + RAJA::resources::Event e = RAJA::launch( + res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), RAJA::Threads(1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, m_range, [&](int j) { + RAJA::loop(ctx, one_range, [&](int k) { + d_array[i * M + j] = i * M + j; + }); + }); + }); + + def_cuda_res.wait_for(&e); }); - def_cuda_res.wait_for(&e); - } - ); - def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); int ec_count = 0; - RAJA::forall( RAJA::RangeSegment(0, N*M), - [=, &ec_count](int i){ - if (h_array[i] != i) ec_count++; - } - ); + RAJA::forall(RAJA::RangeSegment(0, N * M), + [=, &ec_count](int i) { + if (h_array[i] != i) ec_count++; + }); std::cout << " Result -- "; if (ec_count > 0) diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index e52923d81f..87c10fa871 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -42,15 +42,17 @@ using device_loop = RAJA::hip_global_thread_x; using launch_policy = RAJA::LaunchPolicy; + >; using loop_pol = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using 
reduce_policy = RAJA::cuda_reduce; @@ -60,11 +62,13 @@ using reduce_policy = RAJA::hip_reduce; using reduce_policy = RAJA::seq_reduce; #endif -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions " + "device"); } // @@ -73,39 +77,51 @@ int main(int argc, char *argv[]) // Example usage ./teams_reductions host or ./teams_reductions device // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams reductions example on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams reductions example on the device \n"); + } // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // const int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { + } + else + { a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // const int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -113,35 +129,39 @@ int main(int argc, char *argv[]) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // -// -// Define index range for iterating over a elements in all examples -// + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::RangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// RAJA::ReduceSum kernel_sum(0); - RAJA::ReduceMin kernel_min(std::numeric_limits::max()); - RAJA::ReduceMax kernel_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc kernel_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc kernel_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMin kernel_min( + std::numeric_limits::max()); + RAJA::ReduceMax kernel_max( + std::numeric_limits::min()); + RAJA::ReduceMinLoc kernel_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc kernel_maxloc( + std::numeric_limits::min(), -1); const int TEAM_SZ = 256; - const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); + const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N, TEAM_SZ); RAJA::resources::Host host_res; @@ -152,44 +172,47 @@ int main(int argc, char *argv[]) RAJA::resources::Hip device_res; #endif - //Get typed erased resource - it will internally store if we are running on the host or device + // Get typed erased resource - it will internally store if we are running on + // the host or device #if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL) - RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = + RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - //How the kernel executes now depends on how the resource is constructed (host or device) - RAJA::launch - (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), - RAJA::Threads(TEAM_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, arange, [&] (int i) { - - kernel_sum += a[i]; + // How the kernel executes now depends on how the resource is constructed + // (host or device) + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&](int i) { + kernel_sum += a[i]; - kernel_min.min(a[i]); - kernel_max.max(a[i]); + 
kernel_min.min(a[i]); + kernel_max.max(a[i]); - kernel_minloc.minloc(a[i], i); - kernel_maxloc.maxloc(a[i], i); - }); - }); + kernel_minloc.minloc(a[i], i); + kernel_maxloc.maxloc(a[i], i); + }); + }); std::cout << "\tsum = " << kernel_sum.get() << std::endl; std::cout << "\tmin = " << kernel_min.get() << std::endl; std::cout << "\tmax = " << kernel_max.get() << std::endl; std::cout << "\tmin, loc = " << kernel_minloc.get() << " , " - << kernel_minloc.getLoc() << std::endl; + << kernel_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << kernel_maxloc.get() << " , " - << kernel_maxloc.getLoc() << std::endl; + << kernel_maxloc.getLoc() << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/tut_daxpy.cpp b/examples/tut_daxpy.cpp index 74b127e0d6..39ecaad085 100644 --- a/examples/tut_daxpy.cpp +++ b/examples/tut_daxpy.cpp @@ -15,12 +15,12 @@ * Daxpy Example * * Computes a += b*c, where a, b are vectors of doubles - * and c is a scalar double. It illustrates similarities between a - * C-style for-loop and a RAJA forall loop. + * and c is a scalar double. It illustrates similarities between a + * C-style for-loop and a RAJA forall loop. * * RAJA features shown: * - `forall` loop iteration template method - * - Index range segment + * - Index range segment * - Execution policies */ @@ -28,187 +28,184 @@ // Functions for checking and printing results // void checkResult(double* v1, double* v2, int len); -void printResult(double* v, int len); +void printResult(double* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA daxpy example...\n"; -// -// Define vector length -// + // + // Define vector length + // const int N = 1000000; -// -// Allocate and initialize vector data. -// + // + // Allocate and initialize vector data. + // double* a0 = new double[N]; double* aref = new double[N]; double* ta = new double[N]; double* tb = new double[N]; - + double c = 3.14159; - - for (int i = 0; i < N; i++) { + + for (int i = 0; i < N; i++) + { a0[i] = 1.0; tb[i] = 2.0; } -// -// Declare and set pointers to array data. -// We reset them for each daxpy version so that -// they all look the same. -// + // + // Declare and set pointers to array data. + // We reset them for each daxpy version so that + // they all look the same. + // double* a = ta; double* b = tb; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - for (int i = 0; i < N; ++i) { + std::memcpy(a, a0, N * sizeof(double)); + + for (int i = 0; i < N; ++i) + { a[i] += b[i] * c; } - std::memcpy( aref, a, N* sizeof(double) ); + std::memcpy(aref, a, N * sizeof(double)); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the following, we show a RAJA version -// of the daxpy operation and how it can -// be run differently by choosing different -// RAJA execution policies. 
-// -// Note that the only thing that changes in -// these versions is the execution policy. -// To implement these cases using the -// programming model choices directly, would -// require unique changes for each. -// - -//----------------------------------------------------------------------------// + // + // In the following, we show a RAJA version + // of the daxpy operation and how it can + // be run differently by choosing different + // RAJA execution policies. + // + // Note that the only thing that changes in + // these versions is the execution policy. + // To implement these cases using the + // programming model choices directly, would + // require unique changes for each. + // + + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// RAJA SIMD version. -// + // + // RAJA SIMD version. + // std::cout << "\n Running RAJA SIMD daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); + // printResult(a, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - a[i] += b[i] * c; - }); + std::memcpy(a, a0, N * sizeof(double)); + + RAJA::forall(RAJA::RangeSegment(0, N), + [=](int i) { a[i] += b[i] * c; }); checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// RAJA CUDA parallel GPU version (256 threads per thread block). -// + // + // RAJA CUDA parallel GPU version (256 threads per thread block). 
+ // std::cout << "\n Running RAJA CUDA daxpy...\n"; - a = 0; b = 0; - cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); - cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); - - cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + a = 0; + b = 0; + cudaErrchk(cudaMalloc((void**)&a, N * sizeof(double))); + cudaErrchk(cudaMalloc((void**)&b, N * sizeof(double))); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - a[i] += b[i] * c; - }); + cudaErrchk(cudaMemcpy(a, a0, N * sizeof(double), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(b, tb, N * sizeof(double), cudaMemcpyHostToDevice)); + + RAJA::forall>( + RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { a[i] += b[i] * c; }); - cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(ta, a, N * sizeof(double), cudaMemcpyDeviceToHost)); cudaErrchk(cudaFree(a)); cudaErrchk(cudaFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// RAJA HIP parallel GPU version (256 threads per thread block). -// + // + // RAJA HIP parallel GPU version (256 threads per thread block). + // std::cout << "\n Running RAJA HIP daxpy...\n"; - a = 0; b = 0; - hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); - hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + a = 0; + b = 0; + hipErrchk(hipMalloc((void**)&a, N * sizeof(double))); + hipErrchk(hipMalloc((void**)&b, N * sizeof(double))); - hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(a, a0, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(b, tb, N * sizeof(double), hipMemcpyHostToDevice)); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - a[i] += b[i] * c; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { a[i] += b[i] * c; }); - hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(ta, a, N * sizeof(double), hipMemcpyDeviceToHost)); hipErrchk(hipFree(a)); hipErrchk(hipFree(b)); a = ta; checkResult(a, aref, N); -//printResult(a, N); +// printResult(a, N); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// - delete[] a0; - delete[] aref; - delete[] ta; + // + // Clean up. + // + delete[] a0; + delete[] aref; + delete[] ta; delete[] tb; - + std::cout << "\n DONE!...\n"; return 0; @@ -217,26 +214,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Function to compare result to reference and report P/F. // -void checkResult(double* v1, double* v2, int len) +void checkResult(double* v1, double* v2, int len) { bool match = true; - for (int i = 0; i < len; i++) { - if ( v1[i] != v2[i] ) { match = false; } + for (int i = 0; i < len; i++) + { + if (v1[i] != v2[i]) + { + match = false; + } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; - } + } } // -// Function to print result. +// Function to print result. 
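The CUDA and HIP variants above follow one pattern: allocate device arrays, copy the inputs over, run the forall with a GPU execution policy whose block size is a template parameter, then copy the result back and free. A condensed sketch of that flow for CUDA with 256-thread blocks (d_a and d_b are illustrative names for the device arrays, not names used in the example):

  // Condensed sketch of the GPU daxpy flow: allocate, copy in, run, copy out, free.
  double* d_a = nullptr;
  double* d_b = nullptr;
  cudaErrchk(cudaMalloc((void**)&d_a, N * sizeof(double)));
  cudaErrchk(cudaMalloc((void**)&d_b, N * sizeof(double)));
  cudaErrchk(cudaMemcpy(d_a, a0, N * sizeof(double), cudaMemcpyHostToDevice));
  cudaErrchk(cudaMemcpy(d_b, tb, N * sizeof(double), cudaMemcpyHostToDevice));

  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::RangeSegment(0, N),
      [=] RAJA_DEVICE(int i) { d_a[i] += d_b[i] * c; });

  cudaErrchk(cudaMemcpy(ta, d_a, N * sizeof(double), cudaMemcpyDeviceToHost));
  cudaErrchk(cudaFree(d_a));
  cudaErrchk(cudaFree(d_b));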
// -void printResult(double* v, int len) +void printResult(double* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "result[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index c584695128..0ce21573bd 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -34,8 +34,9 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using forall - CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when using workgroup + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block when + using forall CUDA_WORKGROUP_BLOCK_SIZE - specifies the number of threads in a + CUDA thread block when using workgroup */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; @@ -56,42 +57,51 @@ const int num_neighbors = 26; // // Functions for checking and printing results // -void checkResult(std::vector const& vars, std::vector const& vars_ref, - int var_size, int num_vars); +void checkResult(std::vector const& vars, + std::vector const& vars_ref, + int var_size, + int num_vars); void printResult(std::vector const& vars, int var_size, int num_vars); // // Functions for allocating and populating packing and unpacking lists // -void create_pack_lists(std::vector& pack_index_lists, std::vector& pack_index_list_lengths, - const int halo_width, const int* grid_dims); -void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, - const int halo_width, const int* grid_dims); +void create_pack_lists(std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const int halo_width, + const int* grid_dims); +void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const int halo_width, + const int* grid_dims); void destroy_pack_lists(std::vector& pack_index_lists); void destroy_unpack_lists(std::vector& unpack_index_lists); -template < typename T > +template struct memory_manager_allocator { using value_type = T; memory_manager_allocator() = default; - template < typename U > - constexpr memory_manager_allocator(memory_manager_allocator const&) noexcept - { } + template + constexpr memory_manager_allocator( + memory_manager_allocator const&) noexcept + {} /*[[nodiscard]]*/ value_type* allocate(size_t num) { - if (num > std::numeric_limits::max() / sizeof(value_type)) { + if (num > std::numeric_limits::max() / sizeof(value_type)) + { throw std::bad_alloc(); } - value_type *ptr = memoryManager::allocate(num); + value_type* ptr = memoryManager::allocate(num); - if (!ptr) { + if (!ptr) + { throw std::bad_alloc(); } @@ -106,45 +116,49 @@ struct memory_manager_allocator }; template -bool operator==(memory_manager_allocator const&, memory_manager_allocator const&) +bool operator==(memory_manager_allocator const&, + memory_manager_allocator const&) { return true; } template -bool operator!=(memory_manager_allocator const& lhs, memory_manager_allocator const& rhs) +bool operator!=(memory_manager_allocator const& lhs, + memory_manager_allocator const& rhs) { return !(lhs == rhs); } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -template < typename T > +template struct pinned_allocator { using value_type = T; pinned_allocator() = default; - template < typename U > + template constexpr pinned_allocator(pinned_allocator const&) noexcept - { } + {} 
/*[[nodiscard]]*/ value_type* allocate(size_t num) { - if (num > std::numeric_limits::max() / sizeof(value_type)) { + if (num > std::numeric_limits::max() / sizeof(value_type)) + { throw std::bad_alloc(); } - value_type *ptr = nullptr; + value_type* ptr = nullptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMallocHost((void **)&ptr, num*sizeof(value_type))); + cudaErrchk(cudaMallocHost((void**)&ptr, num * sizeof(value_type))); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipHostMalloc((void **)&ptr, num*sizeof(value_type))); + hipErrchk(hipHostMalloc((void**)&ptr, num * sizeof(value_type))); #endif - if (!ptr) { + if (!ptr) + { throw std::bad_alloc(); } @@ -176,12 +190,13 @@ bool operator!=(pinned_allocator const& lhs, pinned_allocator const& rhs) #endif -int main(int argc, char **argv) +int main(int argc, char** argv) { std::cout << "\n\nRAJA halo exchange example...\n"; - if (argc != 1 && argc != 7) { + if (argc != 1 && argc != 7) + { std::cerr << "Usage: tut_halo-exchange " << "[grid_x grid_y grid_z halo_width num_vars num_cycles]\n"; std::exit(1); @@ -194,47 +209,46 @@ int main(int argc, char **argv) // Define number of grid variables // Define number of cycles // - const int grid_dims[3] = { (argc != 7) ? 100 : std::atoi(argv[1]), - (argc != 7) ? 100 : std::atoi(argv[2]), - (argc != 7) ? 100 : std::atoi(argv[3]) }; - const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); - const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); - const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); + const int grid_dims[3] = {(argc != 7) ? 100 : std::atoi(argv[1]), + (argc != 7) ? 100 : std::atoi(argv[2]), + (argc != 7) ? 100 : std::atoi(argv[3])}; + const int halo_width = (argc != 7) ? 1 : std::atoi(argv[4]); + const int num_vars = (argc != 7) ? 3 : std::atoi(argv[5]); + const int num_cycles = (argc != 7) ? 3 : std::atoi(argv[6]); // _halo_exchange_input_params_end - std::cout << "grid dimensions " << grid_dims[0] - << " x " << grid_dims[1] - << " x " << grid_dims[2] << "\n" - << "halo width " << halo_width << "\n" - << "number of variables " << num_vars << "\n" - << "number of cycles " << num_cycles << "\n"; + std::cout << "grid dimensions " << grid_dims[0] << " x " << grid_dims[1] + << " x " << grid_dims[2] << "\n" + << "halo width " << halo_width << "\n" + << "number of variables " << num_vars << "\n" + << "number of cycles " << num_cycles << "\n"; - if ( grid_dims[0] < halo_width || - grid_dims[1] < halo_width || - grid_dims[2] < halo_width ) { + if (grid_dims[0] < halo_width || grid_dims[1] < halo_width || + grid_dims[2] < halo_width) + { std::cerr << "Error: " << "grid dimensions must not be smaller than the halo width\n"; std::exit(1); } - const int grid_plus_halo_dims[3] = { grid_dims[0] + 2*halo_width, - grid_dims[1] + 2*halo_width, - grid_dims[2] + 2*halo_width }; + const int grid_plus_halo_dims[3] = {grid_dims[0] + 2 * halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + 2 * halo_width}; - const int var_size = grid_plus_halo_dims[0] * - grid_plus_halo_dims[1] * - grid_plus_halo_dims[2] ; + const int var_size = + grid_plus_halo_dims[0] * grid_plus_halo_dims[1] * grid_plus_halo_dims[2]; // _halo_exchange_vars_allocate_start // // Allocate grid variables and reference grid variables used to check // correctness. 
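With the default arguments this sizing works out as follows: each grid dimension is padded by halo_width cells on both sides, so the default 100 x 100 x 100 grid with halo_width 1 becomes 102 x 102 x 102, and each variable holds var_size = 102^3 = 1,061,208 doubles (roughly 8.5 MB). A small worked sketch of the same computation:

  // Worked example of the sizing above with the default inputs.
  const int default_dim  = 100 + 2 * 1;                              // 102
  const int default_size = default_dim * default_dim * default_dim;  // 1,061,208
  // default_size * sizeof(double) is about 8.5 MB per grid variable.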
// - std::vector vars (num_vars, nullptr); + std::vector vars(num_vars, nullptr); std::vector vars_ref(num_vars, nullptr); - for (int v = 0; v < num_vars; ++v) { - vars[v] = memoryManager::allocate(var_size); + for (int v = 0; v < num_vars; ++v) + { + vars[v] = memoryManager::allocate(var_size); vars_ref[v] = memoryManager::allocate(var_size); } // _halo_exchange_vars_allocate_end @@ -245,12 +259,14 @@ int main(int argc, char **argv) // Generate index lists for packing and unpacking // std::vector pack_index_lists(num_neighbors, nullptr); - std::vector pack_index_list_lengths(num_neighbors, 0); - create_pack_lists(pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); + std::vector pack_index_list_lengths(num_neighbors, 0); + create_pack_lists( + pack_index_lists, pack_index_list_lengths, halo_width, grid_dims); std::vector unpack_index_lists(num_neighbors, nullptr); - std::vector unpack_index_list_lengths(num_neighbors, 0); - create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); + std::vector unpack_index_list_lengths(num_neighbors, 0); + create_unpack_lists( + unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); // _halo_exchange_index_list_generate_end @@ -263,7 +279,7 @@ int main(int argc, char **argv) auto timer = RAJA::Timer(); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// { std::cout << "\n Running C-style halo exchange...\n"; @@ -272,74 +288,82 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - for (int i = 0; i < var_size; i++) { - var[i] = i + v; + for (int i = 0; i < var_size; i++) + { + var[i] = i + v; + } } - } - // _halo_exchange_sequential_cstyle_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_sequential_cstyle_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - for (int i = 0; i < len; i++) { - buffer[i] = var[list[i]]; + for (int i = 0; i < len; i++) + { + buffer[i] = var[list[i]]; + } + + buffer += len; } - buffer += len; + // send single message } + // _halo_exchange_sequential_cstyle_packing_end - // send single message - } - // _halo_exchange_sequential_cstyle_packing_end + // _halo_exchange_sequential_cstyle_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // _halo_exchange_sequential_cstyle_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // recv single message - // recv single message + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = 
unpack_index_list_lengths[l]; + // unpack + for (int v = 0; v < num_vars; ++v) + { - // unpack - for (int v = 0; v < num_vars; ++v) { + double* var = vars[v]; - double* var = vars[v]; + for (int i = 0; i < len; i++) + { + var[list[i]] = buffer[i]; + } - for (int i = 0; i < len; i++) { - var[list[i]] = buffer[i]; + buffer += len; } - - buffer += len; } - } - // _halo_exchange_sequential_cstyle_unpacking_end - + // _halo_exchange_sequential_cstyle_unpacking_end } timer.stop(); @@ -348,30 +372,33 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // copy result of exchange for reference later - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; double* var_ref = vars_ref[v]; - for (int i = 0; i < var_size; i++) { + for (int i = 0; i < var_size; i++) + { var_ref[i] = var[i]; } } } -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA loop forall halo exchange...\n"; @@ -383,74 +410,78 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_seq_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; - } + buffer += len; + } - // send single message - } - // _halo_exchange_seq_forall_packing_end + // send single message + } + // _halo_exchange_seq_forall_packing_end - // _halo_exchange_seq_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* 
list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - // _halo_exchange_seq_forall_unpacking_end - + // _halo_exchange_seq_forall_unpacking_end } timer.stop(); @@ -459,136 +490,139 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with allows deferred execution -// This has overhead and indirection not in the separate loop version, -// but can be useful for debugging. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with allows deferred execution + // This has overhead and indirection not in the separate loop version, + // but can be useful for debugging. + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; + std::cout << "\n Running RAJA loop workgroup halo exchange...\n"; double minCycle = std::numeric_limits::max(); // _halo_exchange_seq_workgroup_policies_start using forall_policy = RAJA::seq_exec; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::seq_work, - RAJA::ordered, - RAJA::ragged_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; + using workgroup_policy = + RAJA::WorkGroupPolicy; + + using workpool = RAJA::WorkPool, + memory_manager_allocator>; + + using workgroup = RAJA::WorkGroup, + memory_manager_allocator>; + + using worksite = RAJA::WorkSite, + memory_manager_allocator>; // _halo_exchange_seq_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - workpool pool_pack (memory_manager_allocator{}); + workpool pool_pack(memory_manager_allocator{}); workpool pool_unpack(memory_manager_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + 
double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_seq_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_seq_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), + [=](int i) { buffer[i] = var[list[i]]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - // send all messages - // _halo_exchange_seq_workgroup_packing_end + // send all messages + // _halo_exchange_seq_workgroup_packing_end - // _halo_exchange_seq_workgroup_unpacking_start - // recv all messages + // _halo_exchange_seq_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), + [=](int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); - - worksite site_unpack = group_unpack.run(); - // _halo_exchange_seq_workgroup_unpacking_end + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + // _halo_exchange_seq_workgroup_unpacking_end } timer.stop(); @@ -597,28 +631,29 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + 
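In the OpenMP forall variant that follows, each packing and unpacking loop is dispatched with omp_parallel_for_exec, so per cycle the packing phase alone issues num_neighbors * num_vars = 26 * 3 = 78 separate parallel loops over fairly short index lists with the default inputs, and the unpacking phase issues another 78. The workgroup variant further below exists to amortize exactly that per-loop overhead.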
//----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Openmp forall halo exchange...\n"; @@ -630,74 +665,78 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_openmp_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; - } + buffer += len; + } - // send single message - } - // _halo_exchange_openmp_forall_packing_end + // send single message + } + // _halo_exchange_openmp_forall_packing_end - // _halo_exchange_openmp_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall(range_segment(0, len), [=](int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - // _halo_exchange_openmp_forall_unpacking_end - + // _halo_exchange_openmp_forall_unpacking_end } timer.stop(); @@ -706,23 +745,24 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup may allow effective parallelism across loops with Openmp. 
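The workgroup variant below addresses this by deferring work: loop bodies are enqueued on a WorkPool, instantiate() captures them into a WorkGroup, and a single run() executes them all and returns a WorkSite. A minimal sketch of that lifecycle, using the workpool/workgroup/worksite and range_segment aliases defined in these variants and assuming the allocator is instantiated for char; buffer, var, list, and len stand for the packing data as in the loops above.

  // Minimal sketch of the WorkPool -> WorkGroup -> WorkSite lifecycle.
  workpool pool(memory_manager_allocator<char>{});

  // Enqueue as many loop bodies as needed; nothing executes yet.
  pool.enqueue(range_segment(0, len),
               [=](int i) { buffer[i] = var[list[i]]; });

  // Capture the enqueued loops into a group, then execute them all at once.
  workgroup group = pool.instantiate();
  worksite site = group.run();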
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup may allow effective parallelism across loops with Openmp. + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA OpenMP workgroup halo exchange...\n"; @@ -731,109 +771,111 @@ int main(int argc, char **argv) // _halo_exchange_openmp_workgroup_policies_start using forall_policy = RAJA::omp_parallel_for_exec; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_work, - RAJA::ordered, - RAJA::ragged_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - memory_manager_allocator >; + using workgroup_policy = + RAJA::WorkGroupPolicy; + + using workpool = RAJA::WorkPool, + memory_manager_allocator>; + + using workgroup = RAJA::WorkGroup, + memory_manager_allocator>; + + using worksite = RAJA::WorkSite, + memory_manager_allocator>; // _halo_exchange_openmp_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate(buffer_len); - } - workpool pool_pack (memory_manager_allocator{}); + workpool pool_pack(memory_manager_allocator{}); workpool pool_unpack(memory_manager_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] (int i) { - var[i] = i + v; - }); - } + RAJA::forall(range_segment(0, var_size), + [=](int i) { var[i] = i + v; }); + } - // _halo_exchange_openmp_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_openmp_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), + [=](int i) { buffer[i] = var[list[i]]; }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - // send all messages - // _halo_exchange_openmp_workgroup_packing_end + // send all messages + // _halo_exchange_openmp_workgroup_packing_end - // _halo_exchange_openmp_workgroup_unpacking_start - // recv all messages + // _halo_exchange_openmp_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* 
list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), + [=](int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - // _halo_exchange_openmp_workgroup_unpacking_end + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + // _halo_exchange_openmp_workgroup_unpacking_end } timer.stop(); @@ -842,30 +884,31 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(buffers[l]); - } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Cuda forall halo exchange...\n"; @@ -873,25 +916,33 @@ int main(int argc, char **argv) std::vector cuda_vars(num_vars, nullptr); - std::vector cuda_pack_index_lists(num_neighbors, nullptr); - std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { cuda_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + cudaMemcpyDefault)); int unpack_len = unpack_index_list_lengths[l]; cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + cudaMemcpyDefault)); } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + 
std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); @@ -901,78 +952,83 @@ int main(int argc, char **argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_cuda_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { buffer[i] = var[list[i]]; }); - buffer += len; - } + buffer += len; + } - cudaErrchk(cudaDeviceSynchronize()); + cudaErrchk(cudaDeviceSynchronize()); - // send single message - } - // _halo_exchange_cuda_forall_packing_end + // send single message + } + // _halo_exchange_cuda_forall_packing_end - // _halo_exchange_cuda_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - cudaErrchk(cudaDeviceSynchronize()); - // _halo_exchange_cuda_forall_unpacking_end + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_forall_unpacking_end } timer.stop(); @@ -981,39 +1037,43 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + for (int v = 0; v < num_vars; ++v) + { + cudaErrchk(cudaMemcpy( + vars[v], 
cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault)); memoryManager::deallocate_gpu(cuda_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with cuda_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Cuda workgroup halo exchange...\n"; @@ -1021,138 +1081,145 @@ int main(int argc, char **argv) std::vector cuda_vars(num_vars, nullptr); - std::vector cuda_pack_index_lists(num_neighbors, nullptr); - std::vector cuda_unpack_index_lists(num_neighbors, nullptr); + std::vector cuda_pack_index_lists(num_neighbors, nullptr); + std::vector cuda_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { cuda_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; cuda_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - cudaErrchk(cudaMemcpy( cuda_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + cudaMemcpyDefault)); int unpack_len = unpack_index_list_lengths[l]; cuda_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - cudaErrchk(cudaMemcpy( cuda_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), cudaMemcpyDefault )); + cudaErrchk(cudaMemcpy(cuda_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + cudaMemcpyDefault)); } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); // _halo_exchange_cuda_workgroup_policies_start using forall_policy = RAJA::cuda_exec_async; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::cuda_work_async, - RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + 
RAJA::indirect_function_call_dispatch>; + + using workpool = RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; // _halo_exchange_cuda_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_cuda_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_cuda_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - cudaErrchk(cudaDeviceSynchronize()); + cudaErrchk(cudaDeviceSynchronize()); - // send all messages - // _halo_exchange_cuda_workgroup_packing_end + // send all messages + // _halo_exchange_cuda_workgroup_packing_end - // _halo_exchange_cuda_workgroup_unpacking_start - // recv all messages + // _halo_exchange_cuda_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); + workgroup group_unpack = pool_unpack.instantiate(); - cudaErrchk(cudaDeviceSynchronize()); - // _halo_exchange_cuda_workgroup_unpacking_end + worksite site_unpack = group_unpack.run(); + cudaErrchk(cudaDeviceSynchronize()); + // _halo_exchange_cuda_workgroup_unpacking_end } timer.stop(); @@ 
-1161,46 +1228,50 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, cuda_vars); - std::swap(pack_index_lists, cuda_pack_index_lists); + std::swap(vars, cuda_vars); + std::swap(pack_index_lists, cuda_pack_index_lists); std::swap(unpack_index_lists, cuda_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - cudaErrchk(cudaMemcpy( vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault )); + for (int v = 0; v < num_vars; ++v) + { + cudaErrchk(cudaMemcpy( + vars[v], cuda_vars[v], var_size * sizeof(double), cudaMemcpyDefault)); memoryManager::deallocate_gpu(cuda_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(cuda_pack_index_lists[l]); memoryManager::deallocate_gpu(cuda_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Separate packing/unpacking loops using forall -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Separate packing/unpacking loops using forall + //----------------------------------------------------------------------------// { std::cout << "\n Running RAJA Hip forall halo exchange...\n"; @@ -1208,25 +1279,33 @@ int main(int argc, char **argv) std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); @@ -1236,78 +1315,83 @@ int main(int argc, char 
**argv) std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_hip_forall_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_forall_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { buffer[i] = var[list[i]]; }); - buffer += len; - } + buffer += len; + } - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send single message - } - // _halo_exchange_hip_forall_packing_end + // send single message + } + // _halo_exchange_hip_forall_packing_end - // _halo_exchange_hip_forall_unpacking_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_forall_unpacking_start + for (int l = 0; l < num_neighbors; ++l) + { - // recv single message + // recv single message - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + RAJA::forall( + range_segment(0, len), + [=] RAJA_DEVICE(int i) { var[list[i]] = buffer[i]; }); - buffer += len; + buffer += len; + } } - } - - hipErrchk(hipDeviceSynchronize()); - // _halo_exchange_hip_forall_unpacking_end + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_forall_unpacking_end } timer.stop(); @@ -1316,179 +1400,193 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 
0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo " + "exchange...\n"; double minCycle = std::numeric_limits::max(); std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); // _halo_exchange_hip_workgroup_policies_start using forall_policy = RAJA::hip_exec_async; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch>; + + using workpool = 
RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; // _halo_exchange_hip_workgroup_policies_end std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - // _halo_exchange_hip_workgroup_packing_start - for (int l = 0; l < num_neighbors; ++l) { + // _halo_exchange_hip_workgroup_packing_start + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - buffer[i] = var[list[i]]; - }); + pool_pack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + buffer[i] = var[list[i]]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send all messages - // _halo_exchange_hip_workgroup_packing_end + // send all messages + // _halo_exchange_hip_workgroup_packing_end - // _halo_exchange_hip_workgroup_unpacking_start - // recv all messages + // _halo_exchange_hip_workgroup_unpacking_start + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE (int i) { - var[list[i]] = buffer[i]; - }); + pool_unpack.enqueue(range_segment(0, len), [=] RAJA_DEVICE(int i) { + var[list[i]] = buffer[i]; + }); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); + workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - hipErrchk(hipDeviceSynchronize()); - // _halo_exchange_hip_workgroup_unpacking_end + worksite site_unpack = group_unpack.run(); + hipErrchk(hipDeviceSynchronize()); + // _halo_exchange_hip_workgroup_unpacking_end } timer.stop(); @@ -1497,188 +1595,200 @@ int main(int argc, char **argv) timer.reset(); } - for 
(int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// -// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution + //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip direct dispatch workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip direct dispatch workgroup halo " + "exchange...\n"; double minCycle = std::numeric_limits::max(); std::vector hip_vars(num_vars, nullptr); - std::vector hip_pack_index_lists(num_neighbors, nullptr); - std::vector hip_unpack_index_lists(num_neighbors, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { hip_vars[v] = memoryManager::allocate_gpu(var_size); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int pack_len = pack_index_list_lengths[l]; hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); - hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_pack_index_lists[l], + pack_index_lists[l], + pack_len * sizeof(int), + hipMemcpyHostToDevice)); int unpack_len = unpack_index_list_lengths[l]; hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); - hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(hip_unpack_index_lists[l], + unpack_index_lists[l], + unpack_len * sizeof(int), + hipMemcpyHostToDevice)); } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); using forall_policy = RAJA::hip_exec_async; - struct Packer { + struct Packer + { double* buffer; double* var; int* list; - RAJA_DEVICE void operator() (int i) const { - 
buffer[i] = var[list[i]]; - } + RAJA_DEVICE void operator()(int i) const { buffer[i] = var[list[i]]; } }; - struct UnPacker { + struct UnPacker + { double* buffer; double* var; int* list; - RAJA_DEVICE void operator()(int i) const { - var[list[i]] = buffer[i]; - } + RAJA_DEVICE void operator()(int i) const { var[list[i]] = buffer[i]; } }; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> - >; - - using workpool = RAJA::WorkPool< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - int, - RAJA::xargs<>, - pinned_allocator >; + using workgroup_policy = RAJA::WorkGroupPolicy< + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list>>; + + using workpool = RAJA:: + WorkPool, pinned_allocator>; + + using workgroup = RAJA:: + WorkGroup, pinned_allocator>; + + using worksite = RAJA:: + WorkSite, pinned_allocator>; std::vector buffers(num_neighbors, nullptr); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { int buffer_len = num_vars * pack_index_list_lengths[l]; buffers[l] = memoryManager::allocate_gpu(buffer_len); - } - workpool pool_pack (pinned_allocator{}); + workpool pool_pack(pinned_allocator{}); workpool pool_unpack(pinned_allocator{}); - for (int c = 0; c < num_cycles; ++c ) { + for (int c = 0; c < num_cycles; ++c) + { timer.start(); { - // set vars - for (int v = 0; v < num_vars; ++v) { + // set vars + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { - var[i] = i + v; - }); - } + RAJA::forall( + range_segment(0, var_size), + [=] RAJA_DEVICE(int i) { var[i] = i + v; }); + } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = pack_index_lists[l]; - int len = pack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; - // pack - for (int v = 0; v < num_vars; ++v) { + // pack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; + buffer += len; + } } - } - workgroup group_pack = pool_pack.instantiate(); + workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + worksite site_pack = group_pack.run(); - hipErrchk(hipDeviceSynchronize()); + hipErrchk(hipDeviceSynchronize()); - // send all messages + // send all messages - // recv all messages + // recv all messages - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { - double* buffer = buffers[l]; - int* list = unpack_index_lists[l]; - int len = unpack_index_list_lengths[l]; + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; - // unpack - for (int v = 0; v < num_vars; ++v) { + // unpack + for (int v = 0; v < num_vars; ++v) + { - double* var = vars[v]; + double* var = vars[v]; - 
pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + pool_unpack.enqueue(range_segment(0, len), + UnPacker{buffer, var, list}); - buffer += len; + buffer += len; + } } - } - workgroup group_unpack = pool_unpack.instantiate(); + workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - hipErrchk(hipDeviceSynchronize()); + worksite site_unpack = group_unpack.run(); + hipErrchk(hipDeviceSynchronize()); } timer.stop(); @@ -1687,45 +1797,52 @@ int main(int argc, char **argv) timer.reset(); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(buffers[l]); - } - std::swap(vars, hip_vars); - std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); std::swap(unpack_index_lists, hip_unpack_index_lists); - for (int v = 0; v < num_vars; ++v) { - hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + for (int v = 0; v < num_vars; ++v) + { + hipErrchk(hipMemcpy(vars[v], + hip_vars[v], + var_size * sizeof(double), + hipMemcpyDeviceToHost)); memoryManager::deallocate_gpu(hip_vars[v]); } - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate_gpu(hip_pack_index_lists[l]); memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); } - std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + std::cout << "\tmin cycle run time : " << minCycle << " seconds" + << std::endl; // check results against reference copy checkResult(vars, vars_ref, var_size, num_vars); - //printResult(vars, var_size, num_vars); + // printResult(vars, var_size, num_vars); } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// - for (int v = 0; v < num_vars; ++v) { + // + // Clean up. + // + for (int v = 0; v < num_vars; ++v) + { memoryManager::deallocate(vars[v]); memoryManager::deallocate(vars_ref[v]); } @@ -1743,20 +1860,30 @@ int main(int argc, char **argv) // // Function to compare result to reference and report P/F. // -void checkResult(std::vector const& vars, std::vector const& vars_ref, - int var_size, int num_vars) +void checkResult(std::vector const& vars, + std::vector const& vars_ref, + int var_size, + int num_vars) { bool correct = true; - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { double* var = vars[v]; double* var_ref = vars_ref[v]; - for (int i = 0; i < var_size; i++) { - if ( var[i] != var_ref[i] ) { correct = false; } + for (int i = 0; i < var_size; i++) + { + if (var[i] != var_ref[i]) + { + correct = false; + } } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -1767,9 +1894,11 @@ void checkResult(std::vector const& vars, std::vector const& v void printResult(std::vector const& vars, int var_size, int num_vars) { std::cout << std::endl; - for (int v = 0; v < num_vars; ++v) { + for (int v = 0; v < num_vars; ++v) + { double* var = vars[v]; - for (int i = 0; i < var_size; i++) { + for (int i = 0; i < var_size; i++) + { std::cout << "result[" << i << "] = " << var[i] << std::endl; } } @@ -1791,119 +1920,202 @@ struct Extent // Function to generate index lists for packing. 
// void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const int halo_width, const int* grid_dims) + std::vector& pack_index_list_lengths, + const int halo_width, + const int* grid_dims) { std::vector pack_index_list_extents(num_neighbors); // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[0] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[1] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[2] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[3] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[4] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[5] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], 
grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[6] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[7] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[8] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[9] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + pack_index_list_extents[10] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[11] = Extent{halo_width, + halo_width + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[12] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[13] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[14] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[15] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[16] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[17] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], 
grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[18] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[19] = Extent{halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[20] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[21] = Extent{halo_width, + halo_width + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[22] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[23] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + halo_width, + halo_width + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; + pack_index_list_extents[24] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + halo_width, + halo_width + halo_width}; + pack_index_list_extents[25] = Extent{grid_dims[0], + grid_dims[0] + halo_width, + grid_dims[1], + grid_dims[1] + halo_width, + grid_dims[2], + grid_dims[2] + halo_width}; const int grid_i_stride = 1; - const int grid_j_stride = grid_dims[0] + 2*halo_width; - const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + const int grid_j_stride = grid_dims[0] + 2 * halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2 * halo_width); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { Extent extent = pack_index_list_extents[l]; pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; + (extent.k_max - extent.k_min); - pack_index_lists[l] = memoryManager::allocate(pack_index_list_lengths[l]); + pack_index_lists[l] = + memoryManager::allocate(pack_index_list_lengths[l]); int* pack_list = pack_index_lists[l]; int list_idx = 0; - for (int kk = extent.k_min; kk < extent.k_max; ++kk) { - for (int jj = extent.j_min; jj < extent.j_max; ++jj) { - for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + for (int kk = extent.k_min; kk < extent.k_max; ++kk) + { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) + { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) + { - int pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; + int pack_idx = + ii * grid_i_stride + jj * grid_j_stride + kk * grid_k_stride; pack_list[list_idx] = pack_idx; @@ -1919,7 +2131,8 @@ void create_pack_lists(std::vector& pack_index_lists, // void destroy_pack_lists(std::vector& pack_index_lists) { - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(pack_index_lists[l]); } } @@ -1928,119 +2141,187 @@ void destroy_pack_lists(std::vector& pack_index_lists) // // Function to generate index lists for unpacking. 
// -void create_unpack_lists(std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, - const int halo_width, const int* grid_dims) +void create_unpack_lists(std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const int halo_width, + const int* grid_dims) { std::vector unpack_index_list_extents(num_neighbors); // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[0] = Extent{0, + halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[2] = Extent{halo_width, + grid_dims[0] + halo_width, + 0, + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[3] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[4] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + 0, + halo_width}; + unpack_index_list_extents[5] = Extent{halo_width, + grid_dims[0] + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + 
halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[6] = Extent{ + 0, halo_width, 0, halo_width, halo_width, grid_dims[2] + halo_width}; + unpack_index_list_extents[7] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + halo_width, + grid_dims[2] + halo_width}; + unpack_index_list_extents[10] = Extent{ + 0, halo_width, halo_width, grid_dims[1] + halo_width, 0, halo_width}; + unpack_index_list_extents[11] = Extent{0, + halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + 0, + halo_width}; + unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + halo_width, + grid_dims[1] + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[14] = Extent{ + halo_width, grid_dims[0] + halo_width, 0, halo_width, 0, halo_width}; + unpack_index_list_extents[15] = Extent{halo_width, + grid_dims[0] + halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[16] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[17] = Extent{halo_width, + grid_dims[0] + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 
2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[18] = + Extent{0, halo_width, 0, halo_width, 0, halo_width}; + unpack_index_list_extents[19] = Extent{0, + halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[20] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[21] = Extent{0, + halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + 0, + halo_width}; + unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + 0, + halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; + unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + 0, + halo_width}; + unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, + grid_dims[0] + 2 * halo_width, + grid_dims[1] + halo_width, + grid_dims[1] + 2 * halo_width, + grid_dims[2] + halo_width, + grid_dims[2] + 2 * halo_width}; const int grid_i_stride = 1; - const int grid_j_stride = grid_dims[0] + 2*halo_width; - const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + const int grid_j_stride = grid_dims[0] + 2 * halo_width; + const int grid_k_stride = grid_j_stride * (grid_dims[1] + 2 * halo_width); - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { Extent extent = unpack_index_list_extents[l]; unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; + (extent.k_max - extent.k_min); - unpack_index_lists[l] = memoryManager::allocate(unpack_index_list_lengths[l]); + unpack_index_lists[l] = + memoryManager::allocate(unpack_index_list_lengths[l]); int* unpack_list = unpack_index_lists[l]; int list_idx = 0; - for (int kk = extent.k_min; kk < extent.k_max; ++kk) { - for (int jj = extent.j_min; jj < extent.j_max; ++jj) { - for (int ii = extent.i_min; ii < extent.i_max; ++ii) { + for (int kk = extent.k_min; kk < extent.k_max; ++kk) + { + for (int jj = extent.j_min; jj < extent.j_max; ++jj) + { + for (int ii = extent.i_min; ii < extent.i_max; ++ii) + { - int unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; + int unpack_idx = + ii * grid_i_stride + jj * grid_j_stride + kk * grid_k_stride; unpack_list[list_idx] = unpack_idx; @@ -2056,7 +2337,8 @@ void create_unpack_lists(std::vector& unpack_index_lists, std::vector // void destroy_unpack_lists(std::vector& unpack_index_lists) { - for (int l = 0; l < num_neighbors; ++l) { + for (int l = 0; l < num_neighbors; ++l) + { memoryManager::deallocate(unpack_index_lists[l]); } } diff --git a/examples/tut_launch_basic.cpp b/examples/tut_launch_basic.cpp index 96a2ffe2f0..fa354d2612 100644 --- a/examples/tut_launch_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -31,7 +31,7 @@ * the example below choses a sequential * execution space and either a CUDA or HIP * 
execution device execution space. -*/ + */ // __host_launch_start using host_launch = RAJA::seq_launch_t; @@ -45,12 +45,12 @@ using device_launch = RAJA::cuda_launch_t; using device_launch = RAJA::hip_launch_t; #endif -using launch_policy = RAJA::LaunchPolicy< - host_launch +using launch_policy = RAJA::LaunchPolicy; + >; /* * RAJA launch exposes a thread/block programming model @@ -64,69 +64,73 @@ using launch_policy = RAJA::LaunchPolicy< * On the host the loops expands to standard C style for loops. */ -using teams_x = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_x = RAJA::LoopPolicy; + >; -using teams_y = RAJA::LoopPolicy< - RAJA::seq_exec +using teams_y = RAJA::LoopPolicy; + >; using threads_x = RAJA::LoopPolicy; + >; using threads_y = RAJA::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) __global__ void gpuKernel() { - //Equivalent CUDA/HIP style thread/block mapping - // _device_loop_start - {int by = blockIdx.y; - {int bx = blockIdx.x; - - {int ty = threadIdx.y; - {int tx = blockIdx.x; - - printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d block_by %d \n", - tx, ty, bx, by); - + // Equivalent CUDA/HIP style thread/block mapping + // _device_loop_start + { + int by = blockIdx.y; + { + int bx = blockIdx.x; + + { + int ty = threadIdx.y; + { + int tx = blockIdx.x; + + printf("device-iter: threadIdx_tx %d threadIdx_ty %d block_bx %d " + "block_by %d \n", + tx, + ty, + bx, + by); } } - } } // _device_loop_end @@ -142,78 +146,102 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (argc != 2) + { + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic " + "device"); } -// -// Run time policy section is demonstrated in this example by specifying -// kernel exection space as a command line argument (host or device). -// Example usage ./tut_launch_basic host or ./tut_launch_basic device -// + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument (host or device). + // Example usage ./tut_launch_basic host or ./tut_launch_basic device + // std::string exec_space = argv[1]; - if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); + if (!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0)) + { + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic " + "device"); return 0; } RAJA::ExecPlace select_cpu_or_gpu; - if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams on the host \n"); } - if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams on the device \n"); } + if (exec_space.compare("host") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + printf("Running RAJA-Teams on the host \n"); + } + if (exec_space.compare("device") == 0) + { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + printf("Running RAJA-Teams on the device \n"); + } -// -// The following three kernels illustrate loop based parallelism -// based on nested for loops. For correctness team and thread loops -// make the assumption that all work inside can be done -// concurrently. 
-// + // + // The following three kernels illustrate loop based parallelism + // based on nested for loops. For correctness team and thread loops + // make the assumption that all work inside can be done + // concurrently. + // // __compute_grid_start - const int Nteams = 2; + const int Nteams = 2; const int Nthreads = 2; // __compute_grid_end - RAJA::launch(select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), - RAJA::Threads(Nthreads,Nthreads)), - - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - // _team_loops_start - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int by) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int bx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int ty) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int tx) { - printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", - tx, ty, bx, by); - - - }); - }); - - }); - }); - // _team_loops_end - - }); - - //Equivalent C style loops - if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + RAJA::launch( + select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(Nteams, Nteams), + RAJA::Threads(Nthreads, Nthreads)), + + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + // _team_loops_start + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, Nteams), [&](int by) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, Nteams), [&](int bx) { + RAJA::loop( + ctx, + RAJA::TypedRangeSegment(0, Nthreads), + [&](int ty) { + RAJA::loop( + ctx, + RAJA::TypedRangeSegment(0, Nthreads), + [&](int tx) { + printf("RAJA Teams: threadId_x %d threadId_y " + "%d teamId_x %d teamId_y %d \n", + tx, + ty, + bx, + by); + }); + }); + }); + }); + // _team_loops_end + }); + + // Equivalent C style loops + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) + { // _c_style_loops_start - for (int by=0; by>>(); cudaDeviceSynchronize(); #endif #if defined(RAJA_ENABLE_HIP) - if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) hipLaunchKernelGGL((gpuKernel), dim3(griddim), dim3(blockdim), 0, 0); hipDeviceSynchronize(); #endif diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index e939d96dbb..53666d20e6 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -64,9 +64,11 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; - if ( row < N && col < N ) { + if (row < N && col < N) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } @@ -79,7 +81,7 @@ __global__ void matMultKernel(int N, double* C, double* A, double* B) // Functions for checking results // template -void checkResult(T *C, int N); +void checkResult(T* C, int N); template void checkResult(RAJA::View> Cview, int N); @@ -88,262 +90,256 @@ void checkResult(RAJA::View> Cview, int N); // Functions for printing results // template -void printResult(T *C, int N); +void printResult(T* C, int N); template void printResult(RAJA::View> Cview, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix multiplication example...\n"; -// -// Define num rows/cols in matrix -// + // + // Define num rows/cols in matrix + // const int N = 1000; -//const int N = CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE; + // const int N = CUDA_BLOCK_SIZE 
* CUDA_BLOCK_SIZE; -// -// Allocate and initialize matrix data. -// - double *A = memoryManager::allocate(N * N); - double *B = memoryManager::allocate(N * N); - double *C = memoryManager::allocate(N * N); - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + // + // Allocate and initialize matrix data. + // + double* A = memoryManager::allocate(N * N); + double* B = memoryManager::allocate(N * N); + double* C = memoryManager::allocate(N * N); + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { A(row, col) = row; B(row, col) = col; } } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix multiplication...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_cstyle_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += A(row, k) * B(k, col); } C(row, col) = dot; - } } // _matmult_cstyle_end checkResult(C, N); -//printResult(C, N); + // printResult(C, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// We define RAJA range segments to define the ranges of -// row, column, and dot-product loops for RAJA variants -// + // + // We define RAJA range segments to define the ranges of + // row, column, and dot-product loops for RAJA variants + // // _matmult_ranges_start RAJA::TypedRangeSegment row_range(0, N); RAJA::TypedRangeSegment col_range(0, N); RAJA::TypedRangeSegment dot_range(0, N); // _matmult_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// For the RAJA implementations of matrix multiplication, we -// use RAJA 'View' objects to access the matrix data. A RAJA view -// holds a pointer to a data array and enables multi-dimensional indexing -// into that data, similar to the macros we defined above. -// + // + // For the RAJA implementations of matrix multiplication, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into that data, similar to the macros we defined above. + // // _matmult_views_start RAJA::View> Aview(A, N, N); RAJA::View> Bview(B, N, N); RAJA::View> Cview(C, N, N); // _matmult_views_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// In the next few examples, we show ways that we can use RAJA::forall -// statements for the matrix multiplication kernel. This usage is not -// recommended for performance reasons. Specifically, it limits the amount -// of parallelism that can be exposed to less than is possible. We show -// this usage here, to make this point clear. Later in this file, we -// introduce RAJA nested loop abstractions and show that we can extract all -// available parallelism. -// -// -// In the first RAJA implementation, we replace the outer 'row' loop -// with a RAJA::forall statement. 
The lambda expression contains the -// inner loops. -// + // + // In the next few examples, we show ways that we can use RAJA::forall + // statements for the matrix multiplication kernel. This usage is not + // recommended for performance reasons. Specifically, it limits the amount + // of parallelism that can be exposed to less than is possible. We show + // this usage here, to make this point clear. Later in this file, we + // introduce RAJA nested loop abstractions and show that we can extract all + // available parallelism. + // + // + // In the first RAJA implementation, we replace the outer 'row' loop + // with a RAJA::forall statement. The lambda expression contains the + // inner loops. + // -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential mat-mult (RAJA-row)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_outerforall_start - RAJA::forall( row_range, [=](int row) { - - for (int col = 0; col < N; ++col) { + RAJA::forall(row_range, [=](int row) { + for (int col = 0; col < N; ++col) + { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += Aview(row, k) * Bview(k, col); } Cview(row, col) = dot; - } - }); // _matmult_outerforall_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we replace the outer 'row' loop and the inner 'col' loop -// with RAJA::forall statements. This will also work with parallel -// execution policies, such as OpenMP and CUDA, with caveats and -// restrictions. -// -// However, nesting RAJA::forall calls like this is not recommended as -// it limits the ability to expose parallelism and flexibility for -// implementation alternatives. -// + // + // Next, we replace the outer 'row' loop and the inner 'col' loop + // with RAJA::forall statements. This will also work with parallel + // execution policies, such as OpenMP and CUDA, with caveats and + // restrictions. + // + // However, nesting RAJA::forall calls like this is not recommended as + // it limits the ability to expose parallelism and flexibility for + // implementation alternatives. + // std::cout << "\n Running sequential mat-mult (RAJA-row, RAJA-col)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_nestedforall_start - RAJA::forall( row_range, [=](int row) { - - RAJA::forall( col_range, [=](int col) { - + RAJA::forall(row_range, [=](int row) { + RAJA::forall(col_range, [=](int col) { double dot = 0.0; - for (int k = 0; k < N; ++k) { + for (int k = 0; k < N; ++k) + { dot += Aview(row, k) * Bview(k, col); } Cview(row, col) = dot; - }); - }); // _matmult_nestedforall_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Next, we use a RAJA::kernel method to execute the kernel. These examples, -// illustrate the basic kernel interface and mechanics. The execution policies -// express the outer row and col loops using the RAJA kernel interface. 
Later, -// in this file we show some more complex policy examples where we express all -// three loops using the kernel interface and use additional kernel features. -// -// This is different than RAJA::forall and so a few points of exmplanation -// are in order: -// -// 1) A range and lambda index argument are required for each level in -// the loop nest. Here, we have two of each since we have a doubly-nested -// loop. -// 2) A range for each loop nest level is specified in a RAJA tuple object. -// The order of ranges in the tuple must match the order of args to the -// lambda for this to be correct, in general. RAJA provides strongly-typed -// indices to help with this. However, this example does not use them. -// 3) An execution policy is required for each level in the loop nest. These -// are specified in the 'RAJA::statement::For' templates in the -// 'RAJA::KernelPolicy type. -// 4) The loop nest ordering is specified in the nested execution policy -- -// the first 'For' policy is the outermost loop, the second 'For' policy -// is the loop nested inside the outermost loop, and so on. -// 5) The integer values that are the first template arguments to the policies -// indicate which range/lambda argument, the policy applies to. -// + // + // Next, we use a RAJA::kernel method to execute the kernel. These examples, + // illustrate the basic kernel interface and mechanics. The execution policies + // express the outer row and col loops using the RAJA kernel interface. Later, + // in this file we show some more complex policy examples where we express all + // three loops using the kernel interface and use additional kernel features. + // + // This is different than RAJA::forall and so a few points of exmplanation + // are in order: + // + // 1) A range and lambda index argument are required for each level in + // the loop nest. Here, we have two of each since we have a doubly-nested + // loop. + // 2) A range for each loop nest level is specified in a RAJA tuple object. + // The order of ranges in the tuple must match the order of args to the + // lambda for this to be correct, in general. RAJA provides strongly-typed + // indices to help with this. However, this example does not use them. + // 3) An execution policy is required for each level in the loop nest. These + // are specified in the 'RAJA::statement::For' templates in the + // 'RAJA::KernelPolicy type. + // 4) The loop nest ordering is specified in the nested execution policy -- + // the first 'For' policy is the outermost loop, the second 'For' policy + // is the loop nested inside the outermost loop, and so on. + // 5) The integer values that are the first template arguments to the policies + // indicate which range/lambda argument, the policy applies to. 
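  // As a concrete illustration of points 1-5, a minimal self-contained sketch
  // (the range, policy, and lambda names here are hypothetical and are not
  // part of this example) might look like:
  //
  //   RAJA::TypedRangeSegment<int> is(0, 3);   // tuple position 0
  //   RAJA::TypedRangeSegment<int> js(0, 4);   // tuple position 1
  //
  //   using SKETCH_POL = RAJA::KernelPolicy<
  //     RAJA::statement::For<1, RAJA::seq_exec,     // outer loop over js
  //       RAJA::statement::For<0, RAJA::seq_exec,   // inner loop over is
  //         RAJA::statement::Lambda<0>
  //       >
  //     >
  //   >;
  //
  //   RAJA::kernel<SKETCH_POL>(RAJA::make_tuple(is, js),
  //     [=](int i, int j) {  // one lambda argument per range, in tuple order
  //       std::cout << "i = " << i << ", j = " << j << std::endl;
  //     });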
+ // std::cout << "\n Running sequential mat-mult (RAJA-nested)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_basickernel_start - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); // _matmult_basickernel_end checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp outer)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_ompkernel_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::omp_parallel_for_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; // _matmult_ompkernel_end RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - omp inner)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // Swapping the template arguments in this nested policy swaps the loop @@ -353,70 +349,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // sequentially, while row (inner) iterations execute in parallel. 
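  // For reference, the EXEC_POL2 policy defined below corresponds roughly to
  // the following hand-written loop nest (an illustrative sketch only, not
  // part of this example):
  //
  //   for (int col = 0; col < N; ++col) {        // sequential outer loop
  //     #pragma omp parallel for
  //     for (int row = 0; row < N; ++row) {      // parallel inner loop
  //       double dot = 0.0;
  //       for (int k = 0; k < N; ++k) {
  //         dot += Aview(row, k) * Bview(k, col);
  //       }
  //       Cview(row, col) = dot;
  //     }
  //   }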
// // _matmult_ompkernel_swap_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::Lambda<0> - > - > - >; + using EXEC_POL2 = RAJA::KernelPolicy>>>; // _matmult_ompkernel_swap_end - RAJA::kernel( RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP mat-mult (RAJA-nested - collapse)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy collapses the row and col loops in an OpenMP parallel region. // This is the same as using an OpenMP 'parallel for' directive on the // outer loop with a 'collapse(2) clause. // - using EXEC_POL3 = - RAJA::KernelPolicy< + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0> - > - >; + RAJA::ArgList<1, 0>, // row, col + RAJA::statement::Lambda<0>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=](int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy replaces the loop nest with a single CUDA kernel launch @@ -430,35 +419,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - >; + RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * 
N * sizeof(double)); // // This policy collapses the col and row loops into a single CUDA kernel @@ -470,50 +454,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + RAJA::statement::Lambda<0>>>>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += Aview(row, k) * Bview(k, col); - } - Cview(row, col) = dot; - - }); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += Aview(row, k) * Bview(k, col); + } + Cview(row, col) = dot; + }); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) - double *d_A = memoryManager::allocate_gpu(N * N); - double *d_B = memoryManager::allocate_gpu(N * N); - double *d_C = memoryManager::allocate_gpu(N * N); + double* d_A = memoryManager::allocate_gpu(N * N); + double* d_B = memoryManager::allocate_gpu(N * N); + double* d_C = memoryManager::allocate_gpu(N * N); std::cout << "\n Running HIP mat-mult (RAJA-nested - POL4)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); - hipErrchk(hipMemcpy( d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_B, B, N * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); RAJA::View> d_Aview(d_A, N, N); RAJA::View> d_Bview(d_B, N, N); @@ -530,38 +514,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. 
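  // Written as a raw HIP/CUDA-style kernel, that mapping corresponds roughly
  // to the sketch below (illustrative only; this kernel is not defined in
  // this file). The block/thread stride loops mirror the *_block_x_loop and
  // *_thread_x_loop policies and reduce to row = blockIdx.x, col = threadIdx.x
  // when enough blocks and threads are launched:
  //
  //   __global__ void rowPerBlockMatMult(int N, double* C, double* A, double* B)
  //   {
  //     for (int row = blockIdx.x; row < N; row += gridDim.x) {
  //       for (int col = threadIdx.x; col < N; col += blockDim.x) {
  //         double dot = 0.0;
  //         for (int k = 0; k < N; ++k) {
  //           dot += A[row * N + k] * B[k * N + col];
  //         }
  //         C[row * N + col] = dot;
  //       }
  //     }
  //   }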
// using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, - RAJA::statement::For<0, RAJA::hip_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - >; + RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += d_Aview(row, k) * d_Bview(k, col); - } - - d_Cview(row, col) = dot; - - }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += d_Aview(row, k) * d_Bview(k, col); + } + + d_Cview(row, col) = dot; + }); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (RAJA-POL5)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // // This policy collapses the col and row loops into a single HIP kernel @@ -573,50 +552,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // same as in this kernel and the one above. // using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, - RAJA::statement::For<0, RAJA::hip_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_loop, + RAJA::statement::For<0, + RAJA::hip_thread_x_loop, + RAJA::statement::Lambda<0>>>>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=] RAJA_DEVICE (int col, int row) { - - double dot = 0.0; - for (int k = 0; k < N; ++k) { - dot += d_Aview(row, k) * d_Bview(k, col); - } - - d_Cview(row, col) = dot; - - }); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + [=] RAJA_DEVICE(int col, int row) { + double dot = 0.0; + for (int k = 0; k < N; ++k) + { + dot += d_Aview(row, k) * d_Bview(k, col); + } + + d_Cview(row, col) = dot; + }); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// The following examples use execution policies to express the outer row and -// col loops as well as the inner dot product loop using the RAJA kernel -// interface. They show some more complex policy examples and use additional -// kernel features. -// + // + // The following examples use execution policies to express the outer row and + // col loops as well as the inner dot product loop using the RAJA kernel + // interface. 
They show some more complex policy examples and use additional + // kernel features. + // - std::cout << "\n Running sequential mat-mult with multiple lambdas (RAJA-POL6a)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas " + "(RAJA-POL6a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // // This policy executes the col, row and k (inner dot product) loops @@ -632,310 +610,312 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // by all three lambdas. // // _matmult_3lambdakernel_seq_start - using EXEC_POL6a = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // inner loop: dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot - > - > - >; + using EXEC_POL6a = RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // inner loop: dot += + // ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set + // C(row, + // col) + // = dot + >>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); // _matmult_3lambdakernel_seq_end checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); -// -// The following examples uses an extension of the lambda statement -// to specify lambda arguments. By specifying arguments within statements -// we remove the requirement that lambdas require all of the tuple contents. -// + // + // The following examples uses an extension of the lambda statement + // to specify lambda arguments. By specifying arguments within statements + // we remove the requirement that lambdas require all of the tuple contents. + // - std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda args in statements (RAJA-POL6b)...\n"; + std::cout << "\n Running sequential mat-mult with multiple lambdas - lambda " + "args in statements (RAJA-POL6b)...\n"; // _matmult_3lambdakernel_args_seq_start // Alias for convenience - using RAJA::Segs; using RAJA::Params; + using RAJA::Segs; - using EXEC_POL6b = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... 
- >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // C(row, col) = dot - > - > - >; + using EXEC_POL6b = RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement::Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // C(row, col) = dot + >>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); // _matmult_3lambdakernel_args_seq_end checkResult(Cview, N); - //printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop collapse (RAJA-POL7)...\n"; + std::cout << "\n Running OpenMP mat-mult with multiple lambdas and loop " + "collapse (RAJA-POL7)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_ompcollapse_start - using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // inner loop: dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C(row, col) = dot - > - >; + using EXEC_POL7 = RAJA::KernelPolicy, // row, col + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // inner loop: dot += ... 
+ >, + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set + // C(row, + // col) = + // dot + >>; // _matmult_3lambdakernel_ompcollapse_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=](double& dot) { dot = 0.0; }, - // lambda 1 - [=] (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=](int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=](int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_OPENMP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL8)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas " + "(RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cuda_start using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... - > - > - > - >; + RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
+ >>>>; // _matmult_3lambdakernel_cuda_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas (RAJA-POL9a)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas " + "(RAJA-POL9a)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // _matmult_3lambdakernel_cudatiled_start using EXEC_POL9a = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += + // ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C + // = ... 
+ >>>>>>; // _matmult_3lambdakernel_cudatiled_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9b)...\n"; + std::cout << "\n Running CUDA mat-mult with multiple lambdas - lambda args " + "in statements (RAJA-POL9b)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); using EXEC_POL9b = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... - >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // set C = ... - > - > - > - > - > - >; + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // set C + // = ... 
+ >>>>>>; RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += Aview(row, k) * Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { Cview(row, col) = dot; } ); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running mat-mult with tiling + shared memory...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // This example builds on the RAJA tiling capabilities presented earlier // and uses RAJA LocalArray's to load tiles of the global matrix @@ -945,134 +925,152 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // synchronization. We recommend viewing tut_matrix-transpose-local-array.cpp // for an introduction to RAJA LocalArray types and thread synchronization. - using Shmem = RAJA::LocalArray>; - - using shmem_Lambda0 = RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; - using shmem_Lambda1 = RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>; - using shmem_Lambda2 = RAJA::statement::Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>; - using shmem_Lambda3 = RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; - using shmem_Lambda4 = RAJA::statement::Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; - - using EXEC_POL10 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed, + using Shmem = + RAJA::LocalArray>; + + using shmem_Lambda0 = + RAJA::statement::Lambda<0, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + using shmem_Lambda1 = RAJA::statement:: + Lambda<1, RAJA::Segs<0, 1>, RAJA::Offsets<0, 1>, RAJA::Params<0>>; + using shmem_Lambda2 = RAJA::statement:: + Lambda<2, RAJA::Segs<1, 2>, RAJA::Offsets<1, 2>, RAJA::Params<1>>; + using shmem_Lambda3 = + RAJA::statement::Lambda<3, RAJA::Offsets<0, 1, 2>, RAJA::Params<0, 1, 2>>; + using shmem_Lambda4 = RAJA::statement:: + Lambda<4, RAJA::Segs<0, 2>, RAJA::Offsets<0, 2>, RAJA::Params<2>>; + + using EXEC_POL10 = RAJA::KernelPolicy, // Tile rows and cols of C (the result matrix C) - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, - RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - - // zero out shmem tile of C - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda0 > >, - - // Slide window across matrix: Load tiles of global matrices A, B and compute - // local dot products - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - - // Load tile of A into shmem - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda1 - > - >, - - // Load tile of B into shmem - 
RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<1, RAJA::cuda_thread_x_loop, - shmem_Lambda2 - > - >, - - RAJA::statement::CudaSyncThreads, - - //Partial multiplication - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda3 - > - > - >, - - RAJA::statement::CudaSyncThreads - >, //sliding window - - //Write memory out to global matrix - RAJA::statement::For<2, RAJA::cuda_thread_y_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - shmem_Lambda4 > > - > - > - > //Create shared memory - >//Cuda kernel - >; - - Shmem aShared, bShared, cShared; - - RAJA::kernel_param( + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::Tile< + 2, + RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + + // zero out shmem tile of C + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<0, RAJA::cuda_thread_x_loop, shmem_Lambda0>>, + + // Slide window across matrix: Load tiles of global matrices + // A, B and compute local dot products + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + + // Load tile of A into shmem + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<0, RAJA::cuda_thread_x_loop, shmem_Lambda1>>, + + // Load tile of B into shmem + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement:: + For<1, RAJA::cuda_thread_x_loop, shmem_Lambda2>>, + + RAJA::statement::CudaSyncThreads, + + // Partial multiplication + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + shmem_Lambda3>>>, + + RAJA::statement::CudaSyncThreads>, // sliding window + + // Write memory out to global matrix + RAJA::statement::For< + 2, + RAJA::cuda_thread_y_loop, + RAJA::statement::For<0, + RAJA::cuda_thread_x_loop, + shmem_Lambda4>>>>> // Create shared + // memory + > // Cuda kernel + >; + + Shmem aShared, bShared, cShared; + + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N), RAJA::TypedRangeSegment(0, N)), RAJA::make_tuple(aShared, bShared, cShared), - // Zero out thread local memory for storing dot products - [=] RAJA_HOST_DEVICE (int tn, int tp, Shmem &cShared) { - - cShared(tn,tp) = 0.0; - - }, - - // Load tile of A - [=] RAJA_HOST_DEVICE (int n, int m, int tn, int tm, Shmem &aShared) { - - aShared(tn, tm) = Aview(n, m); - - }, - - // Load tile of B - [=] RAJA_HOST_DEVICE (int m, int p, int tm, int tp, Shmem &bShared) { - - bShared(tm, tp) = Bview(m, p); - - }, - - // Do partial update in shmem - [=] RAJA_HOST_DEVICE (int tn, int tm, int tp, Shmem &aShared, Shmem &bShared, Shmem & cShared) { - - cShared(tn,tp) += aShared(tn,tm) * bShared(tm, tp); - - }, - - // Write out complete result - [=] RAJA_HOST_DEVICE (int n, int p, int tn, int tp, Shmem &cShared) { - - Cview(n,p) = cShared(tn,tp); - - }); + // Zero out thread local memory for storing dot products + [=] RAJA_HOST_DEVICE(int tn, int tp, Shmem& cShared) { + cShared(tn, tp) = 0.0; + }, + + // Load tile of A + [=] RAJA_HOST_DEVICE(int n, int m, int tn, int tm, Shmem& aShared) { + aShared(tn, tm) = Aview(n, m); + }, + + // Load tile of B + [=] RAJA_HOST_DEVICE(int m, int p, int tm, int tp, Shmem& bShared) { + bShared(tm, tp) = Bview(m, p); + }, + + // Do partial update in shmem + [=] RAJA_HOST_DEVICE(int tn, + int tm, + int tp, + Shmem& aShared, + 
Shmem& bShared, + Shmem& cShared) { + cShared(tn, tp) += aShared(tn, tm) * bShared(tm, tp); + }, + + // Write out complete result + [=] RAJA_HOST_DEVICE(int n, int p, int tn, int tp, Shmem& cShared) { + Cview(n, p) = cShared(tn, tp); + }); checkResult(Cview, N); -//printResult(Cview, N); +// printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running CUDA tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); + std::memset(C, 0, N * N * sizeof(double)); // Define thread block dimensions dim3 blockdim(CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch CUDA kernel defined near the top of this file. matMultKernel<<>>(N, C, A, B); @@ -1080,154 +1078,158 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaDeviceSynchronize(); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); #endif // if RAJA_ENABLE_CUDA -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running HIP mat-mult with multiple lambdas (RAJA-POL8)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hip_start using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1> // dot += ... - >, - RAJA::statement::Lambda<2, - RAJA::Segs<0,1>, RAJA::Params<0>> // set C = ... - > - > - > - >; + RAJA::KernelPolicy>, // dot = 0.0 + RAJA::statement::For<2, + RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += ... + >, + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ... 
+ >>>>; // _matmult_3lambdakernel_hip_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += d_Aview(row, k) * d_Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - d_Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { + d_Cview(row, col) = dot; + } ); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); //----------------------------------------------------------------------------// - std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in statements (RAJA-POL9)...\n"; + std::cout << "\n Running HIP mat-mult with multiple lambdas - lambda args in " + "statements (RAJA-POL9)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // _matmult_3lambdakernel_hiptiled_start using EXEC_POL9b = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // col - RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 - RAJA::statement::For<2, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0,1,2>, Params<0>> // dot += ... - >, - RAJA::statement::Lambda<2, Segs<0,1>, Params<0>> // set C = ... - > - > - > - > - > - >; - // _matmult_3lambdakernel_hiptiled_end + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_y_loop, // row + RAJA::statement::For< + 0, + RAJA::hip_thread_x_loop, // col + RAJA::statement::Lambda<0, Params<0>>, // dot = 0.0 + RAJA::statement::For< + 2, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1, 2>, Params<0>> // dot += ... + >, + RAJA::statement::Lambda<2, Segs<0, 1>, Params<0>> // set C + // = ... 
+ >>>>>>; + // _matmult_3lambdakernel_hiptiled_end RAJA::kernel_param( - RAJA::make_tuple(col_range, row_range, dot_range), + RAJA::make_tuple(col_range, row_range, dot_range), - RAJA::tuple{0.0}, // thread local variable for 'dot' + RAJA::tuple{0.0}, // thread local variable for 'dot' - // lambda 0 - [=] RAJA_DEVICE (double& dot) { - dot = 0.0; - }, + // lambda 0 + [=] RAJA_DEVICE(double& dot) { dot = 0.0; }, - // lambda 1 - [=] RAJA_DEVICE (int col, int row, int k, double& dot) { - dot += d_Aview(row, k) * d_Bview(k, col); - }, + // lambda 1 + [=] RAJA_DEVICE(int col, int row, int k, double& dot) { + dot += d_Aview(row, k) * d_Bview(k, col); + }, - // lambda 2 - [=] RAJA_DEVICE (int col, int row, double& dot) { - d_Cview(row, col) = dot; - } + // lambda 2 + [=] RAJA_DEVICE(int col, int row, double& dot) { + d_Cview(row, col) = dot; + } ); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP tiled mat-mult (no RAJA)...\n"; - std::memset(C, 0, N*N * sizeof(double)); - hipErrchk(hipMemcpy( d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice )); + std::memset(C, 0, N * N * sizeof(double)); + hipErrchk(hipMemcpy(d_C, C, N * N * sizeof(double), hipMemcpyHostToDevice)); // Define thread block dimensions dim3 blockdim(HIP_BLOCK_SIZE, HIP_BLOCK_SIZE); // Define grid dimensions to match the RAJA version above - dim3 griddim(RAJA_DIVIDE_CEILING_INT(N,blockdim.x), - RAJA_DIVIDE_CEILING_INT(N,blockdim.y)); + dim3 griddim(RAJA_DIVIDE_CEILING_INT(N, blockdim.x), + RAJA_DIVIDE_CEILING_INT(N, blockdim.y)); -//printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, (int)griddim.y, (int)blockdim.x, (int)blockdim.y); + // printf("griddim = (%d,%d), blockdim = (%d,%d)\n", (int)griddim.x, + // (int)griddim.y, (int)blockdim.x, (int)blockdim.y); // Launch HIP kernel defined near the top of this file. - hipLaunchKernelGGL((matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); + hipLaunchKernelGGL( + (matMultKernel), dim3(griddim), dim3(blockdim), 0, 0, N, d_C, d_A, d_B); hipDeviceSynchronize(); - hipErrchk(hipMemcpy( C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(C, d_C, N * N * sizeof(double), hipMemcpyDeviceToHost)); checkResult(Cview, N); -//printResult(Cview, N); + // printResult(Cview, N); memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); #endif // if RAJA_ENABLE_HIP -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -1244,16 +1246,22 @@ template void checkResult(T* C, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( C(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(C(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -1262,16 +1270,22 @@ template void checkResult(RAJA::View> Cview, int N) { bool match = true; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - if ( std::abs( Cview(row, col) - row * col * N ) > 10e-12 ) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + if (std::abs(Cview(row, col) - row * col * N) > 10e-12) + { match = false; } } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -1283,10 +1297,12 @@ template void printResult(T* C, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << C(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << C(row, col) + << std::endl; } } std::cout << std::endl; @@ -1296,10 +1312,12 @@ template void printResult(RAJA::View> Cview, int N) { std::cout << std::endl; - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - std::cout << "C(" << row << "," << col << ") = " - << Cview(row, col) << std::endl; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + std::cout << "C(" << row << "," << col << ") = " << Cview(row, col) + << std::endl; } } std::cout << std::endl; diff --git a/examples/wave-eqn.cpp b/examples/wave-eqn.cpp index e3b83480ee..ce1a8fc101 100644 --- a/examples/wave-eqn.cpp +++ b/examples/wave-eqn.cpp @@ -13,7 +13,7 @@ #include "RAJA/RAJA.hpp" /* - * Time-Domain Finite Difference + * Time-Domain Finite Difference * Acoustic Wave Equation Solver * * ------[Details]---------------------- @@ -26,7 +26,7 @@ * The scheme uses a second order central difference discretization * for time and a fourth order central difference discretization for space. * Periodic boundary conditions are assumed on the grid [-1,1] x [-1, 1]. - * + * * NOTE: The x and y dimensions are discretized identically. 
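 *
 * Sketch of the update implemented in wave() below, using the code's own
 * variable names ('ct' is the scalar coefficient passed in from main):
 *
 *   coeff     = { -1/12, 4/3, -5/2, 4/3, -1/12 }
 *   lap(i,j)  = sum_{r = -2..2} coeff[r+2] * ( P2(i+r, j) + P2(i, j+r) )
 *   P1(i,j)   = 2 * P2(i,j) - P1(i,j) + ct * lap(i,j)
 *
 * Indices wrap modulo nx to apply the periodic boundary conditions, and the
 * P1/P2 pointers are swapped after each time step in main.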
* ----[RAJA Concepts]------------------- * - RAJA kernels are portable and a single implemenation can run @@ -34,7 +34,7 @@ * * RAJA MaxReduction - RAJA's implementation for computing a maximum value * (MinReduction computes the min) -*/ + */ // // ---[Constant Values]------- @@ -51,7 +51,8 @@ const double PI = 3.14159265359; // h - Spacing between grid points // n - Number of grid points // -struct grid_s { +struct grid_s +{ double ox, dx; int nx; }; @@ -66,16 +67,17 @@ struct grid_s { // template -void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx); +void wave(T* P1, T* P2, RAJA::RangeSegment fdBounds, double ct, int nx); double waveSol(double t, double x, double y); -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid); -void computeErr(double *P, double tf, grid_s grid); +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid); +void computeErr(double* P, double tf, grid_s grid); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout<<"Time-Domain Finite Difference Acoustic Wave Equation Solver"<(entries); - double *P2 = memoryManager::allocate(entries); + double* P1 = memoryManager::allocate(entries); + double* P2 = memoryManager::allocate(entries); // //----[Time stepping parameters]---- @@ -123,21 +125,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Sequential policy - using fdPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; + using fdPolicy = RAJA::KernelPolicy>>>; // OpenMP policy - //using fdPolicy = RAJA::KernelPolicy< - //RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + // using fdPolicy = RAJA::KernelPolicy< + // RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; // CUDA policy - //using fdPolicy = - //RAJA::KernelPolicy< + // using fdPolicy = + // RAJA::KernelPolicy< // RAJA::statement::CudaKernel< - // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, RAJA::cuda_block_y_direct, - // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, RAJA::cuda_block_x_direct, + // RAJA::statement::Tile<1, RAJA::tile_fixed<16>, + // RAJA::cuda_block_y_direct, + // RAJA::statement::Tile<0, RAJA::tile_fixed<16>, + // RAJA::cuda_block_x_direct, // RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // RAJA::statement::Lambda<0> @@ -151,13 +156,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) time = 0; setIC(P1, P2, (time - dt), time, grid); - for (int k = 0; k < nt; ++k) { + for (int k = 0; k < nt; ++k) + { wave(P1, P2, fdBounds, ct, grid.nx); time += dt; - double *Temp = P2; + double* Temp = P2; P2 = P1; P1 = Temp; } @@ -185,29 +191,30 @@ double waveSol(double t, double x, double y) // // Error is computed via ||P_{approx}(:) - P_{analytic}(:)||_{inf} // -void computeErr(double *P, double tf, grid_s grid) +void computeErr(double* P, double tf, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); RAJA::ReduceMax tMax(-1.0); - using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec , - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + grid.nx * ty; - double x = grid.ox + tx * grid.dx; - double y 
= grid.ox + ty * grid.dx; - double myErr = std::abs(P[id] - waveSol(tf, x, y)); - - // - // tMax.max() is used to store the maximum value - // - tMax.max(myErr); - }); + using initialPolicy = RAJA::KernelPolicy>>>; + + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + grid.nx * ty; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + double myErr = + std::abs(P[id] - waveSol(tf, x, y)); + + // + // tMax.max() is used to store the maximum value + // + tMax.max(myErr); + }); double lInfErr = tMax; printf("Max Error = %lg, dx = %f \n", lInfErr, grid.dx); @@ -217,63 +224,64 @@ void computeErr(double *P, double tf, grid_s grid) // // Function to set intial condition // -void setIC(double *P1, double *P2, double t0, double t1, grid_s grid) +void setIC(double* P1, double* P2, double t0, double t1, grid_s grid) { RAJA::RangeSegment fdBounds(0, grid.nx); - using initialPolicy = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; - - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] (RAJA::Index_type tx, RAJA::Index_type ty) { - - int id = tx + ty * grid.nx; - double x = grid.ox + tx * grid.dx; - double y = grid.ox + ty * grid.dx; - - P1[id] = waveSol(t0, x, y); - P2[id] = waveSol(t1, x, y); - }); -} + using initialPolicy = RAJA::KernelPolicy>>>; + RAJA::kernel(RAJA::make_tuple(fdBounds, fdBounds), + [=](RAJA::Index_type tx, RAJA::Index_type ty) { + int id = tx + ty * grid.nx; + double x = grid.ox + tx * grid.dx; + double y = grid.ox + ty * grid.dx; + + P1[id] = waveSol(t0, x, y); + P2[id] = waveSol(t1, x, y); + }); +} template -void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx) +void wave(T* P1, T* P2, RAJA::RangeSegment fdBounds, double ct, int nx) { - RAJA::kernel(RAJA::make_tuple(fdBounds,fdBounds), - [=] RAJA_HOST_DEVICE (RAJA::Index_type tx, RAJA::Index_type ty) { - // - //Coefficients for fourth order stencil - // - double coeff[5] = { -1.0/12.0, 4.0/3.0, -5.0/2.0, 4.0/3.0, -1.0/12.0}; - - const int id = tx + ty * nx; - double P_old = P1[id]; - double P_curr = P2[id]; - - // - // Compute Laplacian - // - double lap = 0.0; - - for (auto r : RAJA::RangeSegment(-sr, sr + 1)) { - const int xi = (tx + r + nx) % nx; - const int idx = xi + nx * ty; - lap += coeff[r + sr] * P2[idx]; - - const int yi = (ty + r + nx) % nx; - const int idy = tx + nx * yi; - lap += coeff[r + sr] * P2[idy]; - } - - // - // Store result - // - P1[id] = 2 * P_curr - P_old + ct * lap; - - }); + RAJA::kernel( + RAJA::make_tuple(fdBounds, fdBounds), + [=] RAJA_HOST_DEVICE(RAJA::Index_type tx, RAJA::Index_type ty) { + // + // Coefficients for fourth order stencil + // + double coeff[5] = { + -1.0 / 12.0, 4.0 / 3.0, -5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0}; + + const int id = tx + ty * nx; + double P_old = P1[id]; + double P_curr = P2[id]; + + // + // Compute Laplacian + // + double lap = 0.0; + + for (auto r : RAJA::RangeSegment(-sr, sr + 1)) + { + const int xi = (tx + r + nx) % nx; + const int idx = xi + nx * ty; + lap += coeff[r + sr] * P2[idx]; + + const int yi = (ty + r + nx) % nx; + const int idy = tx + nx * yi; + lap += coeff[r + sr] * P2[idy]; + } + + // + // Store result + // + P1[id] = 2 * P_curr - P_old + ct * lap; + }); } diff --git a/exercises/atomic-histogram.cpp b/exercises/atomic-histogram.cpp index 602a04a10e..ecdc1a9e7d 100644 --- a/exercises/atomic-histogram.cpp +++ b/exercises/atomic-histogram.cpp @@ -19,9 +19,9 @@ 
* * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. * * This file contains sequential and OpenMP variants of the histogram @@ -41,11 +41,11 @@ Specifies the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; +// const int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; +// const int HIP_BLOCK_SIZE = 256; #endif // @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. // // _array_atomic_histogram_start @@ -72,31 +72,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,50 +106,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); // _range_atomic_histogram_start - //RAJA::TypedRangeSegment array_range(0,N); + // RAJA::TypedRangeSegment array_range(0,N); // _range_atomic_histogram_end /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::atomicAdd operation with RAJA::seq_atomic policy. /// /// You will need to uncomment the range segment definition /// above to use it in the kernel. /// - //RAJA::forall(array_range, [=](int i) { - //}); + // RAJA::forall(array_range, [=](int i) { + // }); checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -159,44 +162,44 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::omp_atomic policy. - /// + /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// /// TODO... /// /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -216,20 +219,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -242,15 +245,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -270,20 +273,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); /// @@ -296,9 +299,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
/// - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -321,12 +324,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -337,7 +347,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp index 368f729ebc..bb3380ffc4 100644 --- a/exercises/atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -19,9 +19,9 @@ * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. - * Given an array of length N containing integers in the interval [0, M), - * you will compute entries in an array 'hist' of length M. Each entry - * hist[i] in the histogram array will equal the number of occurrences of + * Given an array of length N containing integers in the interval [0, M), + * you will compute entries in an array 'hist' of length M. Each entry + * hist[i] in the histogram array will equal the number of occurrences of * the value 'i' in the orginal array. * * This file contains sequential and OpenMP variants of the histogram @@ -62,7 +62,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array bounds and initialize array to compute histogram of values - // on. + // on. // // _array_atomic_histogram_start @@ -72,31 +72,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { array[i] = rand() % M; } // _array_atomic_histogram_end int* hist_ref = memoryManager::allocate(M); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential historgram...\n"; std::memset(hist_ref, 0, M * sizeof(int)); - for (int i = 0; i < N; ++i) { - hist_ref[ array[i] ]++; + for (int i = 0; i < N; ++i) + { + hist_ref[array[i]]++; } -//printArray(hist_ref, M); + // printArray(hist_ref, M); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -104,43 +106,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - #pragma omp parallel for - for (int i = 0; i < N; ++i) { - #pragma omp atomic - hist[ array[i] ]++; +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { +#pragma omp atomic + hist[array[i]]++; } checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -#endif +#endif -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential atomic histogram...\n"; std::memset(hist, 0, M * sizeof(int)); - // _range_atomic_histogram_start - RAJA::TypedRangeSegment array_range(0,N); - // _range_atomic_histogram_end + // _range_atomic_histogram_start + RAJA::TypedRangeSegment array_range(0, N); + // _range_atomic_histogram_end RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); -//----------------------------------------------------------------------------// -// RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA omp_atomic policy is used with the RAJA OpenMP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -148,46 +149,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajaomp_atomic_histogram_start + // _rajaomp_atomic_histogram_start RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); // _rajaomp_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA OpenMP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA OpenMP + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - + std::memset(hist, 0, M * sizeof(int)); RAJA::forall(array_range, [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); - }); - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA cuda_atomic policy is used with the RAJA CUDA execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -195,47 +192,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajacuda_atomic_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - - RAJA::atomicAdd(&hist[array[i]], 1); - - }); + // _rajacuda_atomic_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajacuda_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA CUDA -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA CUDA + // execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; - - std::memset(hist, 0, M * sizeof(int)); - - // _rajacuda_atomicauto_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); - }); + // _rajacuda_atomicauto_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajacuda_atomicauto_histogram_end - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA hip_atomic policy is used with the RAJA HIP execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA hip_atomic policy is used with the RAJA HIP execution policy. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -243,41 +238,39 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); - // _rajahip_atomic_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - - RAJA::atomicAdd(&hist[array[i]], 1); - - }); + // _rajahip_atomic_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajahip_atomic_histogram_end checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif -//----------------------------------------------------------------------------// -// RAJA auto_atomic policy can also be used with the RAJA HIP -// execution policy. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA auto_atomic policy can also be used with the RAJA HIP + // execution policy. 
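// All of the variants in this file instantiate one pattern; a minimal sketch
// (EXEC_POL and ATOMIC_POL are placeholders, standing for any of the matched
// pairs used above, e.g. omp_parallel_for_exec with omp_atomic or
// cuda_exec<CUDA_BLOCK_SIZE> with cuda_atomic):
//
//   RAJA::forall<EXEC_POL>(array_range, [=] (int i) {  // GPU variants add RAJA_DEVICE
//     RAJA::atomicAdd<ATOMIC_POL>(&hist[array[i]], 1);
//   });
//
// The auto_atomic policy selects an atomic implementation compatible with the
// enclosing execution policy, which is why the same lambda body appears
// unchanged in the sequential, OpenMP, CUDA, and HIP variants.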
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - - std::memset(hist, 0, M * sizeof(int)); - - // _rajahip_atomicauto_histogram_start - RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); - }); + // _rajahip_atomicauto_histogram_start + RAJA::forall>( + array_range, [=] RAJA_DEVICE(int i) { + RAJA::atomicAdd(&hist[array[i]], 1); + }); // _rajahip_atomicauto_histogram_end - + checkResult(hist, hist_ref, M); -//printArray(hist, M); + // printArray(hist, M); #endif @@ -300,12 +293,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void checkResult(int* hist, int* hist_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && hist[i] != hist_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && hist[i] != hist_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -316,7 +316,8 @@ void checkResult(int* hist, int* hist_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp index c2830c6cb2..9625220983 100644 --- a/exercises/dot-product.cpp +++ b/exercises/dot-product.cpp @@ -14,9 +14,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. * * RAJA features shown: @@ -33,38 +33,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. 
+ // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -73,7 +75,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -83,16 +85,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::seq_exec - /// execution policy type and RAJA::seq_reduce. + /// execution policy type and RAJA::seq_reduce. /// /// NOTE: We've done this one for you to help you get started... /// RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - seqdot += a[i] * b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { seqdot += a[i] * b[i]; }); dot = seqdot.get(); @@ -101,7 +102,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -111,8 +112,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement the dot product kernel using a RAJA::omp_parallel_for_exec - /// execution policy type and RAJA::omp_reduce reduction policy type. + /// EXERCISE: Implement the dot product kernel using a + /// RAJA::omp_parallel_for_exec + /// execution policy type and RAJA::omp_reduce reduction policy + /// type. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -121,11 +124,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//const int CUDA_BLOCK_SIZE = 256; + // const int CUDA_BLOCK_SIZE = 256; std::cout << "\n Running RAJA CUDA dot product...\n"; @@ -135,10 +138,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::cuda_exec - /// execution policy type and RAJA::cuda_reduce reduction policy type. - /// + /// execution policy type and RAJA::cuda_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' above. - /// if you want to use it here. + /// if you want to use it here. 
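/// The remaining variants follow the same recipe as the sequential version
/// above; a minimal sketch (REDUCE_POL and EXEC_POL are placeholders for the
/// matched pairs named in the TODOs, e.g. omp_reduce with
/// omp_parallel_for_exec, or cuda_reduce with cuda_exec<CUDA_BLOCK_SIZE>):
///
///   RAJA::ReduceSum<REDUCE_POL, double> dotsum(0.0);
///   RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N),
///                          [=] (int i) { dotsum += a[i] * b[i]; });
///   dot = dotsum.get();
///
/// The GPU variants additionally mark the lambda with RAJA_DEVICE and, for
/// HIP, read from the device copies d_a and d_b.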
/// std::cout << "\t (a, b) = " << dot << std::endl; @@ -146,30 +150,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//const int HIP_BLOCK_SIZE = 256; + // const int HIP_BLOCK_SIZE = 256; std::cout << "\n Running RAJA HIP dot product...\n"; dot = 0.0; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::hip_exec - /// execution policy type and RAJA::hip_reduce reduction policy type. - /// + /// execution policy type and RAJA::hip_reduce reduction policy + /// type. + /// /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -180,11 +185,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) -//const int SYCL_BLOCK_SIZE = 256; + // const int SYCL_BLOCK_SIZE = 256; std::cout << "\n Running RAJA SYCL dot product...\n"; @@ -194,10 +199,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the dot product kernel using a RAJA::sycl_exec - /// execution policy type and RAJA::sycl_reduce. + /// execution policy type and RAJA::sycl_reduce. /// /// NOTE: You will need to uncomment 'SYCL_BLOCK_SIZE' above - /// if you want to use it here. + /// if you want to use it here. /// std::cout << "\t (a, b) = " << dot << std::endl; @@ -206,7 +211,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -222,10 +227,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/dot-product_solution.cpp b/exercises/dot-product_solution.cpp index d0ae458171..ea8acae3c4 100644 --- a/exercises/dot-product_solution.cpp +++ b/exercises/dot-product_solution.cpp @@ -16,9 +16,9 @@ /* * Vector Dot Product Exercise * - * Computes dot = (a,b), where a, b are vectors of + * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA - * supports a portable parallel reduction opertion in a way that + * supports a portable parallel reduction opertion in a way that * the code looks like it does in a sequential implementation. 
* * RAJA features shown: @@ -35,38 +35,40 @@ // void checkResult(double compdot, double refdot); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: vector dot product...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data -// - double *a = memoryManager::allocate(N); - double *b = memoryManager::allocate(N); + // + // Allocate and initialize vector data + // + double* a = memoryManager::allocate(N); + double* b = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = 1.0; b[i] = 1.0; } -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// C-style dot product operation. -// + // + // C-style dot product operation. + // std::cout << "\n Running C-version of dot product...\n"; // _csytle_dotprod_start double dot = 0.0; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { dot += a[i] * b[i]; } @@ -75,7 +77,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double dot_ref = dot; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential dot product...\n"; @@ -84,9 +86,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - seqdot += a[i] * b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { seqdot += a[i] * b[i]; }); dot = seqdot.get(); // _rajaseq_dotprod_end @@ -96,7 +97,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; @@ -106,9 +107,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajaomp_dotprod_start RAJA::ReduceSum ompdot(0.0); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - ompdot += a[i] * b[i]; - }); + RAJA::forall( + RAJA::RangeSegment(0, N), [=](int i) { ompdot += a[i] * b[i]; }); dot = ompdot.get(); // _rajaomp_dotprod_end @@ -119,7 +119,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -132,10 +132,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - cudot += a[i] * b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { cudot += a[i] * b[i]; }); dot = cudot.get(); // _rajacuda_dotprod_end @@ -145,7 +144,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(dot, dot_ref); #endif -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -155,19 +154,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) dot = 0.0; - double *d_a = memoryManager::allocate_gpu(N); - double *d_b = memoryManager::allocate_gpu(N); + double* d_a = memoryManager::allocate_gpu(N); + double* d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(double), hipMemcpyHostToDevice)); // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - hpdot += d_a[i] * d_b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { hpdot += d_a[i] * d_b[i]; }); dot = hpdot.get(); // _rajahip_dotprod_end @@ -180,7 +178,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_b); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) @@ -193,10 +191,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - hpdot += a[i] * b[i]; - }); + RAJA::forall>( + RAJA::RangeSegment(0, N), + [=] RAJA_DEVICE(int i) { hpdot += a[i] * b[i]; }); dot = static_cast(hpdot.get()); // _rajasycl_dotprod_end @@ -207,7 +204,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// memoryManager::deallocate(a); @@ -223,10 +220,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // void checkResult(double compdot, double refdot) { - if ( compdot == refdot ) { + if (compdot == refdot) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } - diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 227af7d2be..006dd27e34 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -66,7 +66,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -87,8 +87,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -104,8 +104,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = 
col; } } @@ -120,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -132,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -151,19 +158,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -186,8 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA:: + LocalArray, RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -214,19 +223,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Initialize the local memory statement as position 2 + /// EXERCISE: Initialize the local memory statement as position 2 /// in the paramater list. 
/// - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::seq_exec, RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::seq_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, + RAJA::seq_exec, RAJA::statement::ForICount<1, RAJA::statement::Param<0>, + RAJA::seq_exec, RAJA::statement::Lambda<1> > > @@ -235,7 +244,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -277,8 +286,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, + RAJA::tile_fixed, RAJA::seq_exec, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -293,7 +303,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the first lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the + first lambda. /// // @@ -308,7 +319,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Use two ForICount statements with seq_exec to call the second lambda. + /// EXERCISE: Use two ForICount statements with seq_exec to call the + second lambda. /// > > @@ -343,65 +355,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_2_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. 
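// For the two TODOs above: a nested pair of ForICount statements over the
// tile dimensions has the same shape already used in the sequential policy
// earlier in this file (a sketch; the Param slots receive the tile-local
// indices that the lambdas take as tx and ty, and the second set of inner
// loops uses Lambda<1> with the loop order swapped):
//
//   RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
//     RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
//       RAJA::statement::Lambda<0>
//     >
//   >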
- // - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_2_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -413,87 +430,89 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< + using CUDA_EXEC_POL = RAJA::KernelPolicy, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. 
- // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::CudaSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -505,93 +524,98 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); - using HIP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< + using HIP_EXEC_POL = RAJA::KernelPolicy, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::HipSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! 
This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::hip_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = d_Aview(row, col); + }, - Tile_Array(ty, tx) = d_Aview(row, col); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + d_Atview(col, row) = Tile_Array(ty, tx); + }); - }, - - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - d_Atview(col, row) = Tile_Array(ty, tx); - - } - ); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _mattranspose_localarray_raja_lambdaargs_start /// @@ 
-609,7 +633,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, + Offsets<1>, Params<0> > > >, @@ -624,7 +649,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), RAJA::TypedRangeSegment(0, N_r)), @@ -644,7 +669,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -657,16 +682,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -678,8 +709,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index 7b44cd3453..802f07826e 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -66,7 +66,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; @@ -87,8 +87,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -104,8 +104,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -120,8 +122,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -132,14 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data 
access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -151,19 +158,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -186,8 +195,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // _mattranspose_localarray_start - using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + using TILE_MEM = RAJA:: + LocalArray, RAJA::SizeList>; TILE_MEM Tile_Array; // _mattranspose_localarray_end @@ -200,43 +209,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _mattranspose_localarray_raja_start - using SEQ_EXEC_POL_I = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - } + using SEQ_EXEC_POL_I = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { 
+ Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } ); // _mattranspose_localarray_raja_end @@ -252,65 +270,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_1_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays in the parameter tuple to intialize. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_1_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays in the parameter tuple to intialize. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -321,65 +344,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_2_POL = - RAJA::KernelPolicy< - // - // (0) Execution policies for outer loops - // These loops iterate over the number of - // tiles needed to carry out the transpose - // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; + using OPENMP_EXEC_2_POL = RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. 
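// In each of these kernel_param calls the lambda arguments arrive in a fixed
// order: the segment values first (col from segment 0, row from segment 1),
// then the ForICount counters bound to Param<0> and Param<1> (named tx and ty
// here), and finally the local array bound to ParamList<2>:
//
//   [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { ... }
//
// The final sequential variant at the end of this file instead lists the
// arguments explicitly per lambda with Segs, Offsets, and Params.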
+ // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Atview(col, row) = Tile_Array(ty, tx); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -391,87 +419,89 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< + using CUDA_EXEC_POL = RAJA::KernelPolicy, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::CudaSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. 
The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); - - }, + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Atview(col, row) = Tile_Array(ty, tx); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -483,138 +513,154 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); - using HIP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< + using HIP_EXEC_POL = RAJA::KernelPolicy, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - // This statement will initalize local array memory inside a - // kernel. 
The cpu_tile_mem policy specifies that memory should be - // allocated on the stack. The entries in the RAJA::ParamList - // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, - // - // (1) Execution policies for the first set of inner - // loops. These loops copy data from the global matrices - // to the local tile. - // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - // Synchronize threads to ensure all loads - // to the local array are complete - RAJA::statement::HipSyncThreads, - // - // (2) Execution policies for the second set of inner - // loops. These loops copy data from the local tile to - // the global matrix. - // Note: The order of the loops have been - // swapped! This enables us to swap which - // index has unit stride. - // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads - > - > - > - > - >; + RAJA::statement::Tile< + 1, + RAJA::tile_fixed, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem< + RAJA::hip_shared_mem, + RAJA::ParamList<2>, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<0>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<1>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0>>>, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. 
+ // + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<1>, + RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<0>, + RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1>>>, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads>>>>>; RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = d_Aview(row, col); + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - }, + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Tile_Array(ty, tx) = d_Aview(row, col); + }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - d_Atview(col, row) = Tile_Array(ty, tx); - - } - ); + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + d_Atview(col, row) = Tile_Array(ty, tx); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise with " + "args in statement ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - //Alias for convenience - using RAJA::Segs; + // Alias for convenience using RAJA::Offsets; using RAJA::Params; + using RAJA::Segs; // _raja_mattranspose_lambdaargs_start - using SEQ_EXEC_POL_II = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > - > - >, - - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > - > - > - - > - > - > - >; - - RAJA::kernel_param( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - - RAJA::make_tuple(Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + using SEQ_EXEC_POL_II = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<0>, + + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement::For<0, + RAJA::seq_exec, + RAJA::statement::Lambda<0, + Segs<0>, + Segs<1>, + Offsets<0>, + Offsets<1>, + Params<0>>>>, + + RAJA::statement::For< + 0, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0>>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM 
&Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - } - ); + [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + }); // _raja_mattranspose_lambdaargs_start checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -627,16 +673,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -648,8 +700,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp index 7316563117..8ab08df5d1 100644 --- a/exercises/kernel-matrix-transpose-tiled.cpp +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -21,14 +21,14 @@ * transposed and returned as a second matrix At. * * This operation is carried out using a tiling algorithm. - * The algorithm iterates over tiles of the matrix A and + * The algorithm iterates over tiles of the matrix A and * performs a transpose copy without explicitly storing the tile. * * The algorithm is expressed as a collection of ``outer`` - * and ``inner`` for loops. Iterations of the inner loop will + * and ``inner`` for loops. Iterations of the inner loop will * tranpose tile entries; while outer loops will iterate over * the number of tiles needed to carryout the transpose. - * We do not assume that tiles divide the number of rows and + * We do not assume that tiles divide the number of rows and * and columns of the matrix. 
* * RAJA features shown: @@ -56,7 +56,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -110,24 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -138,12 +144,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -154,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start @@ -168,23 +174,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// NOTE: We have done this first one for you. 
/// - using TILED_KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -192,7 +198,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -214,7 +221,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -224,7 +232,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -233,35 +242,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
// - using TILED_KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -277,7 +288,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, + row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -285,47 +297,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_x_direct, - RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + 
RAJA::hip_thread_x_direct, + RAJA::statement::For<0, + RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -340,7 +356,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. @@ -349,16 +365,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -370,11 +392,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -110,24 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { 
+ if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -138,12 +144,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Further partioning of the iteration space is carried out in the + // Further partioning of the iteration space is carried out in the // tile_fixed statements. Iterations inside a RAJA loop is given by their - // global iteration number. + // global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -154,27 +160,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. The template parameter inside + // using sequential loops. The template parameter inside // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using TILED_KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -182,7 +188,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -190,29 +197,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. 
// - using TILED_KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::omp_parallel_for_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// - std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner " + "loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -221,99 +229,107 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. // - using TILED_KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using TILED_KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_x_direct, - RAJA::statement::For<0, RAJA::cuda_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using TILED_KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_x_direct, + 
RAJA::statement::For<0, + RAJA::cuda_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); using TILED_KERNEL_EXEC_POL_HIP = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, - RAJA::statement::For<1, RAJA::hip_thread_x_direct, - RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - d_Atview(col, row) = d_Aview(row, col); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy, + RAJA::hip_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_loop, + RAJA::statement::For< + 1, + RAJA::hip_thread_x_direct, + RAJA::statement::For<0, + RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -328,7 +344,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -337,16 +353,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -358,11 +380,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,13 +108,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // -//RAJA::TypedRangeSegment row_Range(0, N_r); -//RAJA::TypedRangeSegment col_Range(0, N_c); + // RAJA::TypedRangeSegment row_Range(0, N_r); + // RAJA::TypedRangeSegment col_Range(0, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running sequential matrix transpose ...\n"; @@ -118,7 +122,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start @@ -127,9 +131,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -149,7 +153,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -163,9 +168,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -174,7 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -183,13 +188,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start /// @@ -197,9 +202,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a /// basic matrix transpose. - /// + /// /// Uncomment 'row_Range' and 'col_Range' objects above so they - /// can be used in the kernel. + /// can be used in the kernel. /// /// @@ -208,7 +213,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Uncomment this code block. /// /* - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=] RAJA_DEVICE (int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -216,10 +221,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -230,7 +235,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -239,16 +244,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -260,11 +271,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,29 +122,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start - using KERNEL_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -148,53 +148,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops sequentially while exposing parallelism on // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start - using KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_thread_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_y_loop, - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy>>>>; + + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -205,7 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -214,16 +205,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -235,11 +232,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -135,110 +143,105 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_outer_end + // _raja_tensorinit_omp_outer_start + using EXEC_POL2 = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL3 = - RAJA::KernelPolicy< + // _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + RAJA::ArgList<2, 1, 0>, // k, j, i + RAJA::statement::Lambda<0>>>; + + RAJA::kernel( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; @@ -262,43 +265,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; + RAJA::KernelPolicy>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -308,56 +306,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end - -// _raja_tensorinit_cuda_tiled_direct_start - using EXEC_POL6 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; + // _cuda_blockdim_end + + // _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = RAJA::KernelPolicy, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For< + 2, + RAJA::cuda_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_tiled_direct_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -365,10 +361,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -377,51 +373,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -435,47 +427,46 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); - -// _raja_tensorinit_hip_tiled_direct_start - using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // k - RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_tiled_direct_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); + + // _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = RAJA::KernelPolicy, + RAJA::hip_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For< + 2, + RAJA::hip_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -494,14 +485,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp index c5041e01a9..9fecb8bfe9 100644 --- a/exercises/kernelintro-execpols_solution.cpp +++ b/exercises/kernelintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,228 +59,219 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; + // _raja_tensorinit_seq_start + using EXEC_POL1 = RAJA::KernelPolicy>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_seq_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start - using EXEC_POL2 = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - > - >; + // _raja_tensorinit_omp_outer_start + using EXEC_POL2 = RAJA::KernelPolicy>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_outer_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_collapse_start - #pragma omp parallel for collapse(3) - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_collapse_start +#pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_collapse_end + // _cstyle_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL3 = - RAJA::KernelPolicy< + // _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; + RAJA::ArgList<2, 1, 0>, // k, j, i + RAJA::statement::Lambda<0>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_collapse_start - using EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // k, j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - >; + // _raja_tensorinit_omp_collapse_start + using EXEC_POL4 = RAJA::KernelPolicy, // k, j + RAJA::statement::For<0, + RAJA::seq_exec, // i + RAJA::statement::Lambda<0>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=]( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_omp_collapse_end + [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_omp_collapse_end checkResult(a, a_ref, N_tot); @@ -288,43 +280,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using EXEC_POL5 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; + RAJA::KernelPolicy>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; @@ -334,56 +321,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; -// _cuda_blockdim_end - -// _raja_tensorinit_cuda_tiled_direct_start - using EXEC_POL6 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; + // _cuda_blockdim_end + + // _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = RAJA::KernelPolicy, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For< + 2, + RAJA::cuda_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), - [=] __device__ ( int i, int j, int k) { - aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_cuda_tiled_direct_end + [=] __device__(int i, int j, int k) { aView(i, j, k) = c * i * j * k; }); + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -391,10 +376,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -403,51 +388,47 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_device_view_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_device_view_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using EXEC_POL7 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k - RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + RAJA::KernelPolicy>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; @@ -461,47 +442,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); - -// _raja_tensorinit_hip_tiled_direct_start - using EXEC_POL8 = - RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // k - RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i - RAJA::statement::Lambda<0> - > - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N), - RAJA::TypedRangeSegment(0, N) ), - - [=] __device__ ( int i, int j, int k) { - d_aView(i, j, k) = c * i * j * k ; - } - ); -// _raja_tensorinit_hip_tiled_direct_end - - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); + + // _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = RAJA::KernelPolicy, + RAJA::hip_block_y_direct, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For< + 2, + RAJA::hip_block_z_direct, // k + RAJA::statement::For< + 1, + RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, + RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0>>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + + [=] __device__(int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k; + }); + // _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... memoryManager::deallocate(a); @@ -520,14 +500,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp index 406ea7e581..18d6bc5e3f 100644 --- a/exercises/kernelintro-nested-loop-reorder.cpp +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. 
* * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,117 +51,141 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). + // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, " + "I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, " + "I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using KJI_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // 
_raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, " + "K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, /// i on middle loop, and k on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, " + "J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; /// /// TODO... /// - /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, + /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, /// k on middle loop, and j on inner loop /// -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - -#if 0 // Enable this code block to generate compiler error. 
+ //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. //----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution @@ -181,4 +205,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/kernelintro-nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp index 9df3ff4657..de28c08e67 100644 --- a/exercises/kernelintro-nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -14,10 +14,10 @@ * Nested Loop Basics and Loop Reordering (RAJA::kernel) * * In this exercise, we introduce basic RAJA::kernel mechanics for executing - * nested loop kernels, including using execution policies to permute the - * order of loops in a loop nest. The exercise performs no actual + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual * computation and just prints out loop indices to show different - * loop ordering. Also, to avoid difficulty in interpreting parallel + * loop ordering. Also, to avoid difficulty in interpreting parallel * output, the execution policies use sequential execution. * * RAJA features shown: @@ -28,18 +28,18 @@ // // Define three named loop index integer types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering +// These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // // _raja_typed_indices_start RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); -RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); -RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); // _raja_typed_indices_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { // _range_min_max_start @@ -51,137 +51,159 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int kmax = 4; // _range_min_max_end -// -// The RAJA variants of the loop nest use the following typed range segments -// based on the typed indices defined above, outside of main(). -// + // + // The RAJA variants of the loop nest use the following typed range segments + // based on the typed indices defined above, outside of main(). 
+ // // _raja_typed_index_ranges_start RAJA::TypedRangeSegment KRange(kmin, kmax); RAJA::TypedRangeSegment JRange(jmin, jmax); RAJA::TypedRangeSegment IRange(imin, imax); // _raja_typed_index_ranges_end - + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, " + "I-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_kji_loops_start - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_kji_loops_end -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, " + "I-inner)" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_kji_loops_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using KJI_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_kji_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, " + "K-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_jik_loops_start - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_jik_loops_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA nested loop order (J-outer, 
I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_jik_loops_start - using JIK_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec,// k - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using JIK_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_jik_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, " + "J-inner" + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _cstyle_ikj_loops_start - for (int i = imin; i < imax; ++i) { - for (int k = kmin; k < kmax; ++k) { - for (int j = jmin; j < jmax; ++j) { - printf( " (%d, %d, %d) \n", i, j, k); + for (int i = imin; i < imax; ++i) + { + for (int k = kmin; k < kmax; ++k) + { + for (int j = jmin; j < jmax; ++j) + { + printf(" (%d, %d, %d) \n", i, j, k); } } } // _cstyle_ikj_loops_end -//----------------------------------------------------------------------------// - + //----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + << "...\n\n" + << " (I, J, K)\n" + << " ---------\n"; // _raja_ikj_loops_start - using IKJ_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec,// j - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); + using IKJ_EXECPOL = RAJA::KernelPolicy>>>>; + + RAJA::kernel( + RAJA::make_tuple(IRange, JRange, KRange), [=](IIDX i, JIDX j, KIDX k) { + printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); // _raja_ikj_loops_end -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// - -#if 0 // Enable this code block to generate compiler error. + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. 
//----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution @@ -201,4 +223,3 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) return 0; } - diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index eea48d073a..8bcab8dc1e 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -50,7 +50,7 @@ // Define dimensionality of matrices and tile size // const int DIM = 2; -#define TILE_DIM (16) // #define to appease msvc +#define TILE_DIM (16) // #define to appease msvc // // Function for checking results @@ -65,7 +65,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -84,8 +84,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -101,8 +101,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -117,8 +119,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -129,14 +133,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -148,19 +155,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,34 +188,38 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - /// - /// TODO ... - /// - /// Exercise Implement loop_icount methods to load tiles of the - /// input matrix into the RAJA_TEAM_SHARED memory array - /// - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + /// + /// TODO ... + /// + /// Exercise Implement loop_icount methods to load tiles of + /// the input matrix into the RAJA_TEAM_SHARED memory array + /// + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - }); }); - - }); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -231,39 +244,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// within the omp parallel region. 
/// - //using loop_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, + N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment + const &col_tile) { - /* - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) + { RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); + }); }); - }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) + { RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(col, row) = Tile_Array[ty][tx]; + }); }); - }); + }); }); - }); - */ - }); + */ + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -282,56 +298,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... 
/// - /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads directly + /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads + /// directly /// const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - /* - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, + N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment + const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int + ty) { RAJA::loop_icount(ctx, col_tile, [&] (int col, int + tx) { - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); + }); }); - }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int + tx) { RAJA::loop_icount(ctx, row_tile, [&] (int row, int + ty) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(col, row) = Tile_Array[ty][tx]; + }); }); - }); + }); }); + */ }); - */ - }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -343,8 +366,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -360,44 +384,48 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, 
RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = d_Aview(row, col); - - }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - d_Atview(col, row) = Tile_Array[ty][tx]; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = d_Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + d_Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - }); }); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -410,16 +438,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -431,8 +465,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index fe2d41ecec..e388a58848 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -50,7 +50,7 @@ // Define dimensionality of matrices and tile size // const int DIM = 2; -#define TILE_DIM (16) // #define to appease msvc +#define TILE_DIM (16) // #define to appease msvc // // Function for checking results @@ -65,7 +65,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA shared matrix transpose example...\n"; @@ -84,8 +84,8 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -101,8 +101,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } @@ -117,8 +119,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // Stack-allocated local array for data on a tile int Tile[TILE_DIM][TILE_DIM]; @@ -129,14 +133,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[ty][tx] = Aview(row, col); } } @@ -148,19 +155,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) + { + for (int ty = 0; ty < TILE_DIM; ++ty) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[ty][tx]; } } } - } } // _mattranspose_localarray_cstyle_end @@ -179,35 +188,39 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); }); - - }); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); @@ -229,36 +242,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when only running on + // the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, 
int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -281,52 +298,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); - + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); - }); - - }); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; - int *d_A = memoryManager::allocate_gpu(N_r * N_c); - int *d_At = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -338,8 +359,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), 
hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -355,44 +377,48 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = d_Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_r), + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + RAJA::TypedRangeSegment(0, N_c), + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + Tile_Array[ty][tx] = d_Aview(row, col); + }); + }); + + RAJA::loop_icount( + ctx, col_tile, [&](int col, int tx) { + RAJA::loop_icount( + ctx, row_tile, [&](int row, int ty) { + d_Atview(col, row) = Tile_Array[ty][tx]; + }); + }); + }); }); - }); - - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - - d_Atview(col, row) = Tile_Array[ty][tx]; - - }); - }); - - }); - }); - - }); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// return 0; } @@ -405,16 +431,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -426,8 +458,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp index 1206cbc680..82e995eee3 100644 --- a/exercises/launch-matrix-transpose-tiled.cpp +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -56,7 +56,7 @@ template void 
printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -111,24 +113,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end @@ -147,13 +153,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. // -/// -/// TODO: Uncomment these range segments so you can use them in the -/// non-HIP exercises in this file. -/* - RAJA::TypedRangeSegment row_Range(0, N_r); - RAJA::TypedRangeSegment col_Range(0, N_c); -*/ + /// + /// TODO: Uncomment these range segments so you can use them in the + /// non-HIP exercises in this file. + /* + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + */ //----------------------------------------------------------------------------// std::cout << "\n Running sequential tiled matrix transpose ...\n"; @@ -165,38 +171,40 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. 
// // _raja_tiled_mattranspose_start - //using loop_pol_1 = RAJA::LoopPolicy; + // using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - /* - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + + /// + /// TODO... + /// + /// EXERCISE: Implement a loop method that takes a col_tile and + /// returns the global index to the column iteration + /// + /// Uncomment the statement below to run the kernel and check + the + /// result. + /// + + //Atview(col, row) = Aview(row, col); - RAJA::loop(ctx, row_tile, [&] (int row) { - - /// - /// TODO... - /// - /// EXERCISE: Implement a loop method that takes a col_tile and - /// returns the global index to the column iteration - /// - /// Uncomment the statement below to run the kernel and check the - /// result. - /// - - //Atview(col, row) = Aview(row, col); + }); }); - }); + */ }); - */ - }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -204,7 +212,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -212,26 +221,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - //using omp_for_pol_2 = RAJA::LoopPolicy; - //using loop_pol_2 = RAJA::LoopPolicy; + // using omp_for_pol_2 = RAJA::LoopPolicy; + // using loop_pol_2 = RAJA::LoopPolicy; /// /// TODO... /// /// EXERCISE: Create a launch_policy_2 that will create an omp parallel region /// - /// Uncomment the kernel below to run it and check the result. - /// - /// + /// Uncomment the kernel below to run it and check the result. 
+ /// + /// /* RAJA::launch( RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { RAJA::loop(ctx, row_tile, [&] (int row) { RAJA::loop(ctx, col_tile, [&] (int col) { @@ -252,7 +263,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -277,39 +288,41 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// EXERCISE: Implement the cuda launch policy to dispatch the kernel below /// on the GPU /// - /// When you uncomment kernel code below, you will also need to + /// When you uncomment kernel code below, you will also need to /// uncomment variables above that are used within it. /// -/* - RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + /* + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, row_Range, [&] + (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, col_Range, [&] + (RAJA::TypedRangeSegment const &col_tile) { - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { - Atview(col, row) = Aview(row, col); + Atview(col, row) = Aview(row, col); + }); }); - }); + }); }); - }); - }); -*/ + }); + */ checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; @@ -317,15 +330,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment row_Range2(0, N_r); RAJA::TypedRangeSegment col_Range2(0, N_c); - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -341,31 
+355,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool hip_async = false; using hip_launch_policy = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range2, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range2, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range2, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range2, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - }); - - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -389,16 +405,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -410,11 +432,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA tiled matrix transpose example...\n"; @@ -77,8 +77,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -94,12 +94,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// @@ -111,30 +113,34 @@ int main(int RAJA_UNUSED_ARG(argc), 
char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // - for (int ty = 0; ty < TILE_DIM; ++ty) { - for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) + { + for (int tx = 0; tx < TILE_DIM; ++tx) + { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } // _cstyle_tiled_mattranspose_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// // @@ -162,25 +168,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using loop_pol_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - - }); // _raja_tiled_mattranspose_end checkResult(Atview, N_c, N_r); @@ -188,7 +196,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -201,32 +210,33 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_2 = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& 
col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda tiled matrix transpose ...\n"; @@ -237,7 +247,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int r_block_sz = TILE_DIM; const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - + // _raja_mattranspose_cuda_start using cuda_teams_y = RAJA::LoopPolicy; using cuda_teams_x = RAJA::LoopPolicy; @@ -246,49 +256,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using cuda_threads_x = RAJA::LoopPolicy; const bool cuda_async = false; - using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_launch_policy = + RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - Atview(col, row) = Aview(row, col); - + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + Atview(col, row) = Aview(row, col); + }); + }); + }); }); - }); - - }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running hip tiled matrix transpose ...\n"; - int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_A = memoryManager::allocate_gpu(N_r * N_c); int* d_At = memoryManager::allocate_gpu(N_r * N_c); RAJA::View> d_Aview(d_A, N_r, N_c); RAJA::View> d_Atview(d_At, N_c, N_r); std::memset(At, 0, N_r * N_c * sizeof(int)); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice)); constexpr int c_block_sz = TILE_DIM; constexpr int r_block_sz = TILE_DIM; @@ -305,30 +318,32 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using hip_launch_policy = RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::tile(ctx, TILE_DIM, row_Range, 
[&] (RAJA::TypedRangeSegment const &row_tile) { - - RAJA::tile (ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { - - RAJA::loop(ctx, row_tile, [&] (int row) { - RAJA::loop(ctx, col_tile, [&] (int col) { - - d_Atview(col, row) = d_Aview(row, col); - - }); - }); - - }); - }); - - }); + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::tile( + ctx, + TILE_DIM, + row_Range, + [&](RAJA::TypedRangeSegment const& row_tile) { + RAJA::tile( + ctx, + TILE_DIM, + col_Range, + [&](RAJA::TypedRangeSegment const& col_tile) { + RAJA::loop(ctx, row_tile, [&](int row) { + RAJA::loop(ctx, col_tile, [&](int col) { + d_Atview(col, row) = d_Aview(row, col); + }); + }); + }); + }); + }); - hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost)); checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif //----------------------------------------------------------------------------// @@ -352,16 +367,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -373,11 +394,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + 
Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,98 +122,90 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. // // _raja_mattranspose_start using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int /*row*/) { - RAJA::loop(ctx, col_Range, [&] (int /*col*/) { - - /// TODO... - /// - /// EXERCISE: Implement the kernel body for the transpose operation - /// - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int /*row*/) { + RAJA::loop(ctx, col_Range, [&](int /*col*/) { + /// TODO... + /// + /// EXERCISE: Implement the kernel body for the transpose operation + /// + }); }); }); - - }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); // // This policy loops sequentially while exposing parallelism on // one of the inner loops. - - //uncomment to use in example below - //using loop_policy_omp = RAJA::LoopPolicy; - using launch_policy_omp = RAJA::LaunchPolicy; - - RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - - /// TODO... - /// - /// EXERCISE: Implement the loops to apply omp parallism and sequential - /// execution on the column and row loops respectively - /// - - //Atview(col, row) = Aview(row, col); + // uncomment to use in example below + // using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; - }); + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /// TODO... 
+ /// + /// EXERCISE: Implement the loops to apply omp parallism and sequential + /// execution on the column and row loops respectively + /// + + // Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; RAJA::launch( - RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16, 16)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -220,7 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -229,16 +225,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -250,11 +252,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA matrix transpose exercise...\n"; @@ -60,8 +60,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -77,12 +77,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running C-version of matrix transpose...\n"; @@ -90,9 +92,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); // _cstyle_mattranspose_start - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - Atview(col, row) = Aview(row, col); + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + Atview(col, row) = Aview(row, col); } } // _cstyle_mattranspose_end @@ -104,10 +108,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following RAJA variants use the RAJA::kernel method to carryout the - // transpose. + // transpose. // // Here, we define RAJA range segments to establish the iteration spaces. - // Iterations inside a RAJA loop is given by their global iteration number. + // Iterations inside a RAJA loop is given by their global iteration number. // RAJA::TypedRangeSegment row_Range(0, N_r); RAJA::TypedRangeSegment col_Range(0, N_c); @@ -118,32 +122,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The following policy carries out the transpose - // using sequential loops. + // using sequential loops. 
// // _raja_mattranspose_start using loop_policy_seq = RAJA::LoopPolicy; using launch_policy_seq = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running openmp matrix transpose - parallel top inner " + "loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -155,56 +157,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using launch_policy_omp = RAJA::LaunchPolicy; RAJA::launch( - RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - }); - checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running cuda matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - + // _raja_mattranspose_cuda_start using cuda_thread_x = RAJA::LoopPolicy; using cuda_thread_y = RAJA::LoopPolicy; - const bool async = false; //execute asynchronously + const bool async = false; // execute asynchronously using launch_policy_cuda = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, row_Range, [&] (int row) { - RAJA::loop(ctx, col_Range, [&] (int col) { - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16, 16)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, row_Range, [&](int row) { + RAJA::loop(ctx, col_Range, [&](int col) { Atview(col, row) = Aview(row, col); - + }); }); }); - - }); // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); - //printResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // Clean up. @@ -215,7 +210,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n DONE!...\n"; return 0; -} +} // // Function to check result and report P/F. 
@@ -224,16 +219,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match = false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -245,11 +246,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,64 +59,71 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; @@ -129,50 +137,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// the tensor initialization kernel. /// -// _raja_tensorinit_seq_start - //using loop_policy_1 = RAJA::LoopPolicy; + // _raja_tensorinit_seq_start + // using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - /* - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] + (int k) { - //Add additional loop methods to complete the kernel + //Add additional loop methods to complete the kernel + }); + */ }); - */ - }); -// _raja_tensorinit_seq_end + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; @@ -186,61 +199,61 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// kernel that creates a parallel outer loop. /// -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start /* using omp_policy_2 = RAJA::LoopPolicy; using loop_policy_2 = RAJA::LoopPolicy; */ using launch_policy_2 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { - - //TODO: Use the omp_policy_2 to distribute loop iterations - //in a RAJA::loop method - /* - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext /*ctx*/) { + // TODO: Use the omp_policy_2 to distribute loop iterations + // in a RAJA::loop method + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] + (int j) { RAJA::loop(ctx, RAJA::TypedRangeSegment(0, + N), [&] (int i) { - }); - }); - */ - }); -// _raja_tensorinit_omp_outer_end + }); + }); + */ + }); + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; @@ -248,34 +261,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - - }); - }); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); -// _raja_tensorinit_cuda_end + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -286,46 +299,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_cuda_tiled_direct_end + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + 
//----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -333,10 +346,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -353,27 +366,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; @@ -381,36 +394,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - - }); - }); - }); - - }); -// _raja_tensorinit_hip_end + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); + }); + // _raja_tensorinit_hip_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -421,42 +433,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_hip_tiled_direct_end + // _raja_tensorinit_hip_tiled_direct_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -475,14 +487,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp index 1bfff68acf..37da99f9f0 100644 --- a/exercises/launchintro-execpols_solution.cpp +++ b/exercises/launchintro-execpols_solution.cpp @@ -37,16 +37,17 @@ #if defined(RAJA_ENABLE_CUDA) // _cuda_tensorinit_kernel_start -template< int i_block_size, int j_block_size, int k_block_size > -__launch_bounds__(i_block_size*j_block_size*k_block_size) -__global__ void nested_init(double* a, double c, int N) +template +__launch_bounds__(i_block_size* j_block_size* k_block_size) __global__ + void nested_init(double* a, double c, int N) { int i = blockIdx.x * i_block_size + threadIdx.x; int j = blockIdx.y * j_block_size + threadIdx.y; int k = blockIdx.z; - if ( i < N && j < N && k < N ) { - a[i+N*(j+N*k)] = c * i * j * k ; + if (i < N && j < N && k < N) + { + a[i + N * (j + N * k)] = c * i * j * k; } } // _cuda_tensorinit_kernel_end @@ -58,174 +59,186 @@ __global__ void nested_init(double* a, double c, int N) void checkResult(double* a, double* aref, const int n); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; -// _init_define_start -// -// 3D tensor has N^3 entries -// + // _init_define_start + // + // 3D tensor has N^3 entries + // constexpr int N = 100; constexpr int N_tot = N * N * N; constexpr double c = 0.0001; double* a = memoryManager::allocate(N_tot); double* a_ref = memoryManager::allocate(N_tot); -// _init_define_end - -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; - -// _cstyle_tensorinit_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - a_ref[i+N*(j+N*k)] = c * i * j * k ; + // _init_define_end + + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference " + "solution ...\n"; + + // _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + a_ref[i + N * (j + N * k)] = c * i * j * k; } } } -// _cstyle_tensorinit_seq_end + // _cstyle_tensorinit_seq_end -//----------------------------------------------------------------------------// -// We introduce a RAJA View to wrap the tensor data pointer and simplify -// multi-dimensional indexing. -// We use this in the rest of the examples in this file. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // We introduce a RAJA View to wrap the tensor data pointer and simplify + // multi-dimensional indexing. + // We use this in the rest of the examples in this file. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential tensor init...\n"; -// _3D_raja_view_start - RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); -// _3D_raja_view_end - -// _cstyle_tensorinit_view_seq_start - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; + // _3D_raja_view_start + RAJA::View> aView(a, N, N, N); + // _3D_raja_view_end + + // _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_view_seq_end + // _cstyle_tensorinit_view_seq_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_seq_start + // _raja_tensorinit_seq_start using loop_policy_1 = RAJA::LoopPolicy; using launch_policy_1 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); }); - }); }); - }); -// _raja_tensorinit_seq_end + // _raja_tensorinit_seq_end checkResult(a, a_ref, N_tot); #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// C-style and RAJA OpenMP multithreading variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA OpenMP multithreading variants. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - // _cstyle_tensorinit_omp_outer_start - #pragma omp parallel for - for (int k = 0; k < N; ++k ) { - for (int j = 0; j < N; ++j ) { - for (int i = 0; i < N; ++i ) { - aView(i, j, k) = c * i * j * k ; +// _cstyle_tensorinit_omp_outer_start +#pragma omp parallel for + for (int k = 0; k < N; ++k) + { + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < N; ++i) + { + aView(i, j, k) = c * i * j * k; } } } -// _cstyle_tensorinit_omp_outer_end + // _cstyle_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_omp_outer_start + // _raja_tensorinit_omp_outer_start using omp_policy_2 = RAJA::LoopPolicy; using loop_policy_2 = RAJA::LoopPolicy; using launch_policy_2 = RAJA::LaunchPolicy; - RAJA::launch - (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(), // LaunchParams may be empty when running on the + // host + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); }); - }); }); - }); -// _raja_tensorinit_omp_outer_end + // _raja_tensorinit_omp_outer_end checkResult(a, a_ref, N_tot); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) // // Define total thread-block size and size of each block dimension // -// _cuda_blockdim_start + // _cuda_blockdim_start constexpr int block_size = 256; constexpr int i_block_sz = 32; constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); -// _cuda_blockdim_end + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); + // _cuda_blockdim_end -//----------------------------------------------------------------------------// -// C-style and RAJA CUDA GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style and RAJA CUDA GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_start + // _raja_tensorinit_cuda_start using cuda_teams_z_3 = RAJA::LoopPolicy; using cuda_global_thread_y_3 = RAJA::LoopPolicy; using cuda_global_thread_x_3 = RAJA::LoopPolicy; @@ -233,34 +246,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_3 = false; using launch_policy_3 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - - }); - }); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); -// _raja_tensorinit_cuda_end + // _raja_tensorinit_cuda_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _raja_tensorinit_cuda_tiled_direct_start + // _raja_tensorinit_cuda_tiled_direct_start using cuda_teams_z_4 = RAJA::LoopPolicy; using cuda_teams_y_4 = RAJA::LoopPolicy; using cuda_teams_x_4 = RAJA::LoopPolicy; @@ -271,46 +284,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_4 = false; using launch_policy_4 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_cuda_tiled_direct_end + // _raja_tensorinit_cuda_tiled_direct_end checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + 
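// ---------------------------------------------------------------------------
// Illustrative aside: with the block-size constants defined above
// (block_size = 256, i_block_sz = 32, j_block_sz = block_size / i_block_sz = 8,
// k_block_sz = 1) and N = 100, the launch shape works out to
//
//   n_blocks_i = RAJA_DIVIDE_CEILING_INT(100, 32) = 4
//   n_blocks_j = RAJA_DIVIDE_CEILING_INT(100,  8) = 13
//   n_blocks_k = RAJA_DIVIDE_CEILING_INT(100,  1) = 100
//
// and 32 * 8 * 1 == 256 == block_size, which is exactly the condition the
// static_assert in the raw CUDA variant below enforces.
// ---------------------------------------------------------------------------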
//----------------------------------------------------------------------------// std::cout << "\n Running CUDA tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. std::memset(a, 0, N_tot * sizeof(double)); -// _cuda_tensorinit_tiled_direct_start + // _cuda_tensorinit_tiled_direct_start dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); - static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + static_assert(i_block_sz * j_block_sz * k_block_sz == block_size, "Invalid block_size"); dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), @@ -318,10 +331,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); nested_init - <<>>(a, c, N); - cudaErrchk( cudaGetLastError() ); + <<>>(a, c, N); + cudaErrchk(cudaGetLastError()); cudaErrchk(cudaDeviceSynchronize()); -// _cuda_tensorinit_tiled_direct_end + // _cuda_tensorinit_tiled_direct_end checkResult(a, a_ref, N_tot); @@ -338,27 +351,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int j_block_sz = block_size / i_block_sz; constexpr int k_block_sz = 1; - const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); - const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); - const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N, i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N, j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N, k_block_sz); -//----------------------------------------------------------------------------// -// RAJA HIP GPU variants. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP GPU variants. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - double *d_a = memoryManager::allocate_gpu(N_tot); + double* d_a = memoryManager::allocate_gpu(N_tot); -// _3D_raja_device_view_start - RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); -// _3D_raja_deviceview_end + // _3D_raja_device_view_start + RAJA::View> d_aView(d_a, N, N, N); + // _3D_raja_deviceview_end - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_start + // _raja_tensorinit_hip_start using hip_teams_z_5 = RAJA::LoopPolicy; using hip_global_thread_y_5 = RAJA::LoopPolicy; using hip_global_thread_x_5 = RAJA::LoopPolicy; @@ -366,36 +379,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_5 = false; using launch_policy_5 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - - }); - }); - }); - - }); -// _raja_tensorinit_hip_end + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int j) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); + }); + // _raja_tensorinit_hip_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; // set tensor data to zero to ensure we initializing it correctly. 
std::memset(a, 0, N_tot * sizeof(double)); - hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice)); -// _raja_tensorinit_hip_tiled_direct_start + // _raja_tensorinit_hip_tiled_direct_start using hip_teams_z_6 = RAJA::LoopPolicy; using hip_teams_y_6 = RAJA::LoopPolicy; using hip_teams_x_6 = RAJA::LoopPolicy; @@ -406,42 +418,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const bool async_6 = false; using launch_policy_6 = RAJA::LaunchPolicy>; - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), - RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { - - RAJA::tile - (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { - - RAJA::tile - (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { - - RAJA::loop(ctx, j_tile, [&] (int j) { - RAJA::loop(ctx, i_tile, [&] (int i) { - - d_aView(i, j, k) = c * i * j * k ; - + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, N), [&](int k) { + RAJA::tile( + ctx, + j_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& j_tile) { + RAJA::tile( + ctx, + i_block_sz, + RAJA::TypedRangeSegment(0, N), + [&](RAJA::TypedRangeSegment const& i_tile) { + RAJA::loop(ctx, j_tile, [&](int j) { + RAJA::loop( + ctx, i_tile, [&](int i) { + d_aView(i, j, k) = c * i * j * k; + }); + }); + }); }); - }); - }); - }); - }); - }); -// _raja_tensorinit_hip_tiled_direct_end + // _raja_tensorinit_hip_tiled_direct_end - hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost)); checkResult(a, a_ref, N_tot); memoryManager::deallocate_gpu(d_a); #endif // if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(a); @@ -460,14 +472,18 @@ void checkResult(double* a, double* aref, const int n) bool correct = true; int i = 0; - while ( correct && (i < n) ) { + while (correct && (i < n)) + { correct = std::abs(a[i] - aref[i]) < 10e-12; i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp index 62d3d6e3e7..6f68615a45 100644 --- a/exercises/memoryManager.hpp +++ b/exercises/memoryManager.hpp @@ -28,20 +28,20 @@ namespace memoryManager { #if defined(RAJA_ENABLE_SYCL) - static camp::resources::Resource* sycl_res; +static camp::resources::Resource* sycl_res; #endif template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); #else ptr = new T[size]; #endif @@ -49,9 +49,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) @@ -65,37 +66,39 @@ void deallocate(T *&ptr) } } -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) - auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #elif defined(RAJA_ENABLE_SYCL) sycl_res->deallocate(ptr); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp index 3432adbb50..478fdef1cb 100644 --- a/exercises/offset-layout-stencil.cpp +++ b/exercises/offset-layout-stencil.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. 
- * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,12 +81,12 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::Layout * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -111,28 +111,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// + // + // Allocate and initialize lattice + // int* input = memoryManager::allocate(totCells * sizeof(int)); int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); @@ -141,104 +141,100 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. -// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. 
+ // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + RAJA::make_offset_layout({{-1, -1}}, {{N_r + 1, N_c + 1}}); RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start - using NESTED_EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + using NESTED_EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -256,12 +252,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// earlier tutorial section. 
/// - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -270,77 +266,68 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajacuda_start - using NESTED_EXEC_POL3 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL3 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running five-point stencil (RAJA-Kernel - " "hip)...\n"; - int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); + int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start - using NESTED_EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, //row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL4 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -348,11 +335,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif 
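// ---------------------------------------------------------------------------
// Illustrative aside (assuming the make_offset_layout behavior described in
// the comments above: begin indices {-1, -1}, end indices {N_r + 1, N_c + 1},
// and the default row-major ordering): an interior access outputView(row, col)
// maps to the flat index
//
//   (row + 1) * totCellsInCol + (col + 1)
//
// For N_r = 5, N_c = 4 (totCellsInCol = 6), outputView(0, 0) therefore touches
// element 7, the first interior cell, which matches id = col + totCellsInCol *
// row with row = col = 1 in the C-style reference kernel.
// ---------------------------------------------------------------------------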
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -367,8 +354,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -386,14 +375,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/offset-layout-stencil_solution.cpp b/exercises/offset-layout-stencil_solution.cpp index f212ca7630..814c6128db 100644 --- a/exercises/offset-layout-stencil_solution.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -16,21 +16,21 @@ /* * Offset Layout Stencil Exercise * - * This exercise applies a five-point stencil to the interior cells of a + * This exercise applies a five-point stencil to the interior cells of a * lattice and stores the resulting sums in a second lattice of equal size. - * You can think of the lattice as representing the centers of cells on a - * two-dimensional Cartesian mesh. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-point stencil accumulates values of a cell and its four neighbors. - * Assuming the cells of a lattice may be accessed through a row/col fashion, + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, * the stencil may be expressed as the following sum: - * + * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros - * for a lattice of size (N_r + 2) x (N_c + 2). + * for a lattice of size (N_r + 2) x (N_c + 2). * * In the case of N_r = N_c = 3, the input lattice values are: * @@ -60,8 +60,8 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to - * simplify the indexing to perform the stencil calculation. For the + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- @@ -81,13 +81,13 @@ * * RAJA features shown: * - RAJA::kernel kernel execution method and execution policies - * - RAJA::View + * - RAJA::View * - RAJA::OffsetLayout * - RAJA::make_offset_layout method * * For the CUDA implementation, we use unified memory to hold the lattice data. * For HIP, we use explicit host-device memory and manually copy data between - * the two. + * the two. 
*/ /* @@ -112,28 +112,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nFive-point stencil example...\n"; -// _stencil_define_start -// -// Define num of interior cells in row/cols in a lattice -// + // _stencil_define_start + // + // Define num of interior cells in row/cols in a lattice + // constexpr int N_r = 5; constexpr int N_c = 4; -// -// Define total num of cells in rows/cols in a lattice -// + // + // Define total num of cells in rows/cols in a lattice + // constexpr int totCellsInRow = N_r + 2; constexpr int totCellsInCol = N_c + 2; -// -// Define total num of cells in a lattice -// + // + // Define total num of cells in a lattice + // constexpr int totCells = totCellsInRow * totCellsInCol; -// _stencil_define_end + // _stencil_define_end -// -// Allocate and initialize lattice -// + // + // Allocate and initialize lattice + // int* input = memoryManager::allocate(totCells * sizeof(int)); int* output = memoryManager::allocate(totCells * sizeof(int)); int* output_ref = memoryManager::allocate(totCells * sizeof(int)); @@ -142,104 +142,100 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); std::memset(output_ref, 0, totCells * sizeof(int)); -// -// C-Style intialization -// -// _stencil_input_init_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // C-Style intialization + // + // _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; input[id] = 1; } } -// _stencil_input_init_end + // _stencil_input_init_end - std::cout << "\ninput lattice:\n"; + std::cout << "\ninput lattice:\n"; printLattice(input, totCellsInRow, totCellsInCol); -// -// Generate reference solution -// -// _stencil_output_ref_start - for (int row = 1; row <= N_r; ++row) { - for (int col = 1; col <= N_c; ++col) { + // + // Generate reference solution + // + // _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) + { + for (int col = 1; col <= N_c; ++col) + { int id = col + totCellsInCol * row; - output_ref[id] = input[id] + input[id + 1] - + input[id - 1] - + input[id + totCellsInCol] - + input[id - totCellsInCol]; + output_ref[id] = input[id] + input[id + 1] + input[id - 1] + + input[id + totCellsInCol] + input[id - totCellsInCol]; } } -// _stencil_output_ref_end + // _stencil_output_ref_end - std::cout << "\noutput reference lattice:\n"; + std::cout << "\noutput reference lattice:\n"; printLattice(output_ref, totCellsInRow, totCellsInCol); -//----------------------------------------------------------------------------// - -// -// The following code illustrates pairing an offset layout and a RAJA view -// object to simplify multidimensional indexing. -// An offset layout is constructed by using the make_offset_layout method. -// The first argument of the layout is an array object with the coordinates of -// the bottom left corner of the lattice, and the second argument is an array -// object of the coordinates of the top right corner plus 1. -// The example uses double braces to initiate the array object and its -// subobjects. -// + //----------------------------------------------------------------------------// + + // + // The following code illustrates pairing an offset layout and a RAJA view + // object to simplify multidimensional indexing. + // An offset layout is constructed by using the make_offset_layout method. 
+ // The first argument of the layout is an array object with the coordinates of + // the bottom left corner of the lattice, and the second argument is an array + // object of the coordinates of the top right corner plus 1. + // The example uses double braces to initiate the array object and its + // subobjects. + // // _offsetlayout_views_start const int DIM = 2; RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + RAJA::make_offset_layout({{-1, -1}}, {{N_r + 1, N_c + 1}}); RAJA::View> inputView(input, layout); RAJA::View> outputView(output, layout); // _offsetlayout_views_end -// -// Create range segments used in kernels -// + // + // Create range segments used in kernels + // // _offsetlayout_ranges_start RAJA::TypedRangeSegment col_range(0, N_c); RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start - using NESTED_EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + using NESTED_EXEC_POL1 = RAJA::KernelPolicy< + RAJA::statement::For<1, + RAJA::seq_exec, // row + RAJA::statement::For<0, + RAJA::seq_exec, // col + RAJA::statement::Lambda<0>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -248,33 +244,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaomp_start - using NESTED_EXEC_POL2 = - RAJA::KernelPolicy< + using NESTED_EXEC_POL2 = RAJA::KernelPolicy< RAJA::statement::Collapse, // row, col - RAJA::statement::Lambda<0> - > - >; - - RAJA::kernel(RAJA::make_tuple(col_range, row_range), - [=](int col, int row) { - - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - - }); + RAJA::ArgList<1, 0>, // row, col + RAJA::statement::Lambda<0>>>; + + RAJA::kernel( + RAJA::make_tuple(col_range, row_range), [=](int col, int row) { + outputView(row, col) = inputView(row, col) + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + }); // _offsetlayout_rajaomp_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, 
totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -283,36 +272,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajacuda_start - using NESTED_EXEC_POL3 = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL3 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - outputView(row, col) = - inputView(row, col) - + inputView(row - 1, col) - + inputView(row + 1, col) - + inputView(row, col - 1) - + inputView(row, col + 1); - + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); }); // _offsetlayout_rajacuda_end - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) @@ -321,42 +304,40 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(output, 0, totCells * sizeof(int)); - int* d_input = memoryManager::allocate_gpu(totCells); + int* d_input = memoryManager::allocate_gpu(totCells); int* d_output = memoryManager::allocate_gpu(totCells); - hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice)); - RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_inputView(d_input, layout); RAJA::View> d_outputView(d_output, layout); // _offsetlayout_rajahip_start - using NESTED_EXEC_POL4 = - RAJA::KernelPolicy< - RAJA::statement::HipKernel< - RAJA::statement::For<1, RAJA::hip_block_x_loop, //row - RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col - RAJA::statement::Lambda<0> - > - > - > - >; + using NESTED_EXEC_POL4 = RAJA::KernelPolicy>>>>; RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { - d_outputView(row, col) = - d_inputView(row, col) - + d_inputView(row - 1, col) - + d_inputView(row + 1, col) - + d_inputView(row, col - 1) - + d_inputView(row, col + 1); + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); }); // _offsetlayout_rajahip_end - hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost)); - std::cout << "\noutput lattice:\n"; + std::cout << "\noutput lattice:\n"; printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); @@ -364,11 +345,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_output); #endif 
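// ---------------------------------------------------------------------------
// Illustrative aside: the OpenMP kernel policy in this file uses
// RAJA::statement::Collapse over ArgList<1, 0> (row, col). A rough C-style
// sketch of that collapsed iteration, shown only for comparison, would be:
//
//   #pragma omp parallel for collapse(2)
//   for (int row = 0; row < N_r; ++row) {
//     for (int col = 0; col < N_c; ++col) {
//       outputView(row, col) = inputView(row, col) + inputView(row - 1, col) +
//                              inputView(row + 1, col) +
//                              inputView(row, col - 1) +
//                              inputView(row, col + 1);
//     }
//   }
// ---------------------------------------------------------------------------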
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(input); memoryManager::deallocate(output); memoryManager::deallocate(output_ref); @@ -383,8 +364,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) { std::cout << std::endl; - for (int row = 0; row < totCellsInRow; ++row) { - for (int col = 0; col < totCellsInCol; ++col) { + for (int row = 0; row < totCellsInRow; ++row) + { + for (int col = 0; col < totCellsInCol; ++col) + { const int id = col + totCellsInCol * row; std::cout << lattice[id] << " "; @@ -402,14 +385,18 @@ void checkResult(int* compLattice, int* refLattice, int totCells) bool correct = true; int i = 0; - while ( correct && (i < totCells) ) { + while (correct && (i < totCells)) + { correct = (compLattice[i] == refLattice[i]); i++; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp index 2fb9d7ac56..b789e63690 100644 --- a/exercises/permuted-layout-batch-matrix-multiply.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -75,77 +75,77 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). 
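// ---------------------------------------------------------------------------
// Illustrative aside: a quick numeric check of the layout1 indexing described
// above (perm {0, 1, 2}, extents {N, N_r, N_c}, equivalent to
// A[c + N_c * (r + N_r * e)]). With N_r = N_c = 3, Aview(e = 2, r = 1, c = 2)
// maps to flat index 2 + 3 * (1 + 3 * 2) = 23; consecutive column indices are
// adjacent in memory, i.e., the column index has unit stride as stated.
// ---------------------------------------------------------------------------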
-// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. + // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). + // // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + std::array perm1{{0, 1, 2}}; + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // /// /// TODO... @@ -158,13 +158,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Then, create views for the A2, B2, C2 arrays using the /// layout object; i.e., Aview2, Bview2, and Cview2. /// - /// Hint: You will the same indexing to access the array data - /// via the Views as for the Views above which are created + /// Hint: You will the same indexing to access the array data + /// via the Views as for the Views above which are created /// using the layout1 View (see kernels in the code below). /// - /// When you are done with the Views, test them out by + /// When you are done with the Views, test them out by /// uncommenting the kernels in the code below that use the - /// the Aview2, Bview2, and Cview2 views. + /// the Aview2, Bview2, and Cview2 views. 
/// // @@ -180,64 +180,65 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(e, row, col) = row; Bview(e, row, col) = col; Cview(e, row, col) = 0; -// Aview2(e, row, col) = row; -// Bview2(e, row, col) = col; -// Cview2(e, row, col) = 0; + // Aview2(e, row, col) = row; + // Bview2(e, row, col) = col; + // Cview2(e, row, col) = 0; } } }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... " << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_loop_end timer.stop(); @@ -245,68 +246,68 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... " << std::endl; -/* - timer.start(); - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - // _permutedlayout2_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); + /* + timer.start(); + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - } - ); - // _permutedlayout2_batchedmatmult_loop_end - timer.stop(); + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << 
std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -316,45 +317,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_omp_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_omp_end timer.stop(); @@ -362,71 +361,72 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); 
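Every variant in this exercise is timed the same way: the kernel is run NITER times and only the fastest run is reported, which filters out one-time overheads such as first-touch page faults and warm-up costs. Below is a minimal sketch of that idiom factored into a helper; the helper name bestOf and its callable parameter are illustrative, not part of the RAJA API.

#include <limits>
#include "RAJA/RAJA.hpp"

// Run fn() 'niter' times and return the fastest elapsed time, mirroring the
// timer/minRun pattern used for every kernel variant in this file.
template <typename Func>
double bestOf(int niter, Func&& fn)
{
  auto timer = RAJA::Timer();
  double minRun = std::numeric_limits<double>::max();
  for (int i = 0; i < niter; ++i)
  {
    timer.start();
    fn();
    timer.stop();
    RAJA::Timer::ElapsedType t = timer.elapsed();
    if (t < minRun) minRun = t;
    timer.reset();
  }
  return minRun;
}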
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... " << std::endl; std::memset(C2, 0, N_c * N_r * N * sizeof(double)); -/* - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); + /* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, + N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); - timer.stop(); + } + ); + timer.stop(); - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -*/ + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + */ #endif 
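The TODO earlier in this exercise file asks for the layout 2 objects. One possible way to fill it in is sketched below; it mirrors the permuted-layout solution file later in this patch, where permutation {1, 2, 0} makes the element index e (argument 0) the unit-stride index, so Aview2(e, r, c) addresses A2[e + N * (c + N_c * r)]. The RAJA::idx_t array type and the Layout<3, int, 0> view template arguments are assumptions inferred from the layout 1 views above.

// Permuted layout for layout 2: the element index e becomes the fastest index.
std::array<RAJA::idx_t, 3> perm2{{1, 2, 0}};
auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2);

// Views over the layout 2 arrays; argument 0 (e) has unit stride.
RAJA::View<double, RAJA::Layout<3, int, 0>> Aview2(A2, layout2);
RAJA::View<double, RAJA::Layout<3, int, 0>> Bview2(B2, layout2);
RAJA::View<double, RAJA::Layout<3, int, 0>> Cview2(C2, layout2);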
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -436,44 +436,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -481,10 +479,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... 
" << std::endl; @@ -496,7 +494,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < NITER; ++i) { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, + N), [=] RAJA_DEVICE(int e) { Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) @@ -542,63 +541,63 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... " << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); RAJA::View> d_Cview(d_C, layout1); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 
0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -606,19 +605,20 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate_gpu(d_A); memoryManager::deallocate_gpu(d_B); memoryManager::deallocate_gpu(d_C); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... " << std::endl; @@ -632,14 +632,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), +hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * +sizeof(double), hipMemcpyHostToDevice )); minRun = std::numeric_limits::max(); for (int i = 0; i < NITER; ++i) { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), + RAJA::forall>(RAJA::TypedRangeSegment(0, +N), [=] RAJA_DEVICE(int e) { d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) @@ -681,7 +683,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), +hipMemcpyDeviceToHost )); std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); @@ -695,11 +698,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -719,19 +722,26 @@ void checkResult(T C, int nMat, int nRows, int nCols) { bool status = true; - for (int e = 0; e < nMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + for (int e = 0; e < nMat; ++e) + { + for (int row = 0; row < nRows; ++row) + { + for (int col = 0; col < nCols; ++col) + { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) + { status = false; } } } } - if ( status ) { + if (status) + { std::cout << "\tresult -- PASS\n"; - } else { + } + else + { std::cout << "\tresult -- FAIL\n"; } } diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp index 297ec45047..05b393ef2b 100644 --- a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -76,81 +76,80 @@ constexpr int HIP_BLOCK_SIZE = 256; #endif // -//Function for checking results +// Function for checking results // template void checkResult(T C, int nMat, int nRows, int nCols); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; -// Dimensions of matrices + // Dimensions of matrices constexpr int N_c = 3; constexpr int N_r = 3; -// Number of matrices + // Number of matrices constexpr int N = 8000000; -// Number of iterations + // Number of iterations constexpr int NITER = 20; std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// + // + // Initialize a RAJA timer object + // and variable to store minimum run time + // auto timer = RAJA::Timer(); double minRun = std::numeric_limits::max(); -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. -// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). -// + // + // Allocate space for data in layout 1 + // + double* A = memoryManager::allocate(N_c * N_r * N); + double* B = memoryManager::allocate(N_c * N_r * N); + double* C = memoryManager::allocate(N_c * N_r * N); + + // + // Layout 1 + // + // make_permuted_layout takes the number of entries in each dimension and a + // templated array indicating index arguments with slowest to fastest stride. + // Standard C++ arrays are used to hold the number of entries in each + // component. This example uses double braces to initalize the array and its + // subobjects. The layout object will index into the array as the following C + // macro would #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. 
+ // + // RAJA::Layout objects may be templated on dimension, argument type, and + // index with unit stride. Here, the column index has unit stride (argument + // 2). + // // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + std::array perm1{{0, 1, 2}}; + auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1); RAJA::View> Aview(A, layout1); RAJA::View> Bview(B, layout1); RAJA::View> Cview(C, layout1); // _permutedlayout_defviews_end -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); - -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// + // + // Allocate space for data in layout 2 + // + double* A2 = memoryManager::allocate(N_c * N_r * N); + double* B2 = memoryManager::allocate(N_c * N_r * N); + double* C2 = memoryManager::allocate(N_c * N_r * N); + + // + // Permuted layout - equivalent to indexing using the following macro + // #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] + // In this case the element index has unit stride (argument 0). + // // _permutedlayout_permviews_start - std::array perm2 {{1, 2, 0}}; - auto layout2 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); + std::array perm2{{1, 2, 0}}; + auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2); RAJA::View> Aview2(A2, layout2); RAJA::View> Bview2(B2, layout2); @@ -170,8 +169,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(e, row, col) = row; Bview(e, row, col) = col; Cview(e, row, col) = 0; @@ -184,50 +185,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - sequential) ... 
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_loop_end timer.stop(); @@ -235,55 +235,53 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - sequential) ... 
" << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // _permutedlayout2_batchedmatmult_loop_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); // _permutedlayout2_batchedmatmult_loop_end timer.stop(); @@ -291,10 +289,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -304,45 +302,43 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); // 
_permutedlayout_batchedmatmult_omp_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); // _permutedlayout_batchedmatmult_omp_end timer.stop(); @@ -350,11 +346,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) if (tMin < minRun) minRun = tMin; timer.reset(); } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; @@ -362,57 +358,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=](int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -422,44 +416,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - 
RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -467,10 +459,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - cuda) ... 
" << std::endl; @@ -478,68 +470,66 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C2, 0, N_c * N_r * N * sizeof(double)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); if (tMin < minRun) minRun = tMin; timer.reset(); } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << " \n Running batched matrix multiplication" << " with layout 1 (RAJA - hip) ... 
" << std::endl; - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double* d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); RAJA::View> d_Aview(d_A, layout1); RAJA::View> d_Bview(d_B, layout1); @@ -549,50 +539,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_Bview2(d_B2, layout2); RAJA::View> d_Cview2(d_C2, layout2); - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk( + hipMemcpy(d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy( + d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice)); minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * 
d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -600,55 +592,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk( + hipMemcpy(C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + std::cout << "\trun time: " << minRun << " seconds" << std::endl; checkResult(Cview, N, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << " \n Running batched matrix multiplication" << " with layout 2 (RAJA - hip) ... " << std::endl; minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { + for (int i = 0; i < NITER; ++i) + { timer.start(); - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE(int e) { - - d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); - - } - ); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), [=] RAJA_DEVICE(int e) { + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * 
d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + }); timer.stop(); RAJA::Timer::ElapsedType tMin = timer.elapsed(); @@ -656,9 +647,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.reset(); } - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( + C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost)); - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + std::cout << "\trun time : " << minRun << " seconds" << std::endl; checkResult(Cview2, N, N_r, N_c); memoryManager::deallocate_gpu(d_A); @@ -669,11 +661,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) memoryManager::deallocate_gpu(d_C2); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(A); memoryManager::deallocate(B); memoryManager::deallocate(C); @@ -693,19 +685,26 @@ void checkResult(T C, int nMat, int nRows, int nCols) { bool status = true; - for (int e = 0; e < nMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + for (int e = 0; e < nMat; ++e) + { + for (int row = 0; row < nRows; ++row) + { + for (int col = 0; col < nCols; ++col) + { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) + { status = false; } } } } - if ( status ) { + if (status) + { std::cout << "\tresult -- PASS\n"; - } else { + } + else + { std::cout << "\tresult -- FAIL\n"; } } diff --git a/exercises/reductions.cpp b/exercises/reductions.cpp index 4c6b90c063..e4752861de 100644 --- a/exercises/reductions.cpp +++ b/exercises/reductions.cpp @@ -32,7 +32,7 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) @@ -45,27 +45,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. 
+ // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -73,26 +77,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start -//RAJA::TypedRangeSegment arange(0, N); - // _reductions_range_end + // RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; @@ -101,7 +105,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Define EXEC_POL1 and REDCUE_POL1 for executing sequentially. /// - + /// TODO... /// /// EXERCISE: Remove comments for remainder of sequential section. @@ -112,11 +116,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc + seq_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc + seq_maxloc(std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - + seq_sum += a[i]; seq_min.min(a[i]); @@ -130,14 +136,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " << seq_maxloc.getLoc() << std::endl; */ - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; @@ -152,7 +158,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. 
+ /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -181,12 +188,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmin, loc = " << omp_minloc.get() << " , " << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; @@ -200,7 +207,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this + /// exercise. /// /// Uncomment 'arange' variable above so it can be used in kernel. /// @@ -232,7 +240,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) */ #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; @@ -240,21 +248,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::TypedRangeSegment arange1(0, N); int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; using REDUCE_POL3 = RAJA::hip_reduce; // _reductions_raja_hippolicy_end RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange1, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange1, [=] RAJA_DEVICE(int i) { hip_sum += d_a[i]; hip_min.min(d_a[i]); @@ -262,28 +271,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_minloc.minloc(d_a[i], i); hip_maxloc.maxloc(d_a[i], i); - }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
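// Illustrative sketch, not from the RAJA sources: the EXERCISE comments above
// ask for an execution policy, a reduction policy, and the five reducer
// objects. This condenses the sequential pattern that the solution file below
// spells out; the function name and argument list are placeholders.
#include <RAJA/RAJA.hpp>
#include <iostream>
#include <limits>

void reduction_sketch(const int* a, int N)
{
  using EXEC_POL   = RAJA::seq_exec;
  using REDUCE_POL = RAJA::seq_reduce;

  RAJA::ReduceSum<REDUCE_POL, int> vsum(0);
  RAJA::ReduceMin<REDUCE_POL, int> vmin(std::numeric_limits<int>::max());
  RAJA::ReduceMax<REDUCE_POL, int> vmax(std::numeric_limits<int>::min());
  RAJA::ReduceMinLoc<REDUCE_POL, int> vminloc(std::numeric_limits<int>::max(), -1);
  RAJA::ReduceMaxLoc<REDUCE_POL, int> vmaxloc(std::numeric_limits<int>::min(), -1);

  // Each lambda invocation folds one element into every reducer.
  RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=](int i) {
    vsum += a[i];
    vmin.min(a[i]);
    vmax.max(a[i]);
    vminloc.minloc(a[i], i);
    vmaxloc.maxloc(a[i], i);
  });

  // Final values are read on the host with get()/getLoc().
  std::cout << "sum = " << vsum.get()
            << ", min loc = " << vminloc.getLoc() << std::endl;
}
// The OpenMP and GPU exercises follow the same shape with
// RAJA::omp_parallel_for_exec / RAJA::omp_reduce and
// RAJA::cuda_exec<CUDA_BLOCK_SIZE> / RAJA::cuda_reduce respectively.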
+ // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/reductions_solution.cpp b/exercises/reductions_solution.cpp index 6da731e62e..46992ec857 100644 --- a/exercises/reductions_solution.cpp +++ b/exercises/reductions_solution.cpp @@ -45,27 +45,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA reductions example...\n"; // _reductions_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 1000000; -// -// Allocate array data and initialize data to alternating sequence of 1, -1. -// + // + // Allocate array data and initialize data to alternating sequence of 1, -1. + // int* a = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - if ( i % 2 == 0 ) { + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { a[i] = 1; - } else { - a[i] = -1; + } + else + { + a[i] = -1; } } -// -// Set min and max loc values -// + // + // Set min and max loc values + // constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; @@ -73,41 +77,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) a[maxloc_ref] = 100; // _reductions_array_init_end -// -// Note: with this data initialization scheme, the following results will -// be observed for all reduction kernels below: -// -// - the sum will be zero -// - the min will be -100 -// - the max will be 100 -// - the min loc will be N/2 -// - the max loc will be N/2 + 1 -// -// - -// -// Define index range for iterating over a elements in all examples -// + // + // Note: with this data initialization scheme, the following results will + // be observed for all reduction kernels below: + // + // - the sum will be zero + // - the min will be -100 + // - the max will be 100 + // - the min loc will be N/2 + // - the max loc will be N/2 + 1 + // + // + + // + // Define index range for iterating over a elements in all examples + // // _reductions_range_start RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential reductions...\n"; // _reductions_raja_seq_start - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - + RAJA::ReduceSum seq_sum(0); RAJA::ReduceMin seq_min(std::numeric_limits::max()); RAJA::ReduceMax seq_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc seq_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc seq_maxloc( + std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - seq_sum += a[i]; seq_min.min(a[i]); @@ -115,37 +120,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) seq_minloc.minloc(a[i], i); seq_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << seq_sum.get() << std::endl; std::cout << "\tmin = " << seq_min.get() << std::endl; std::cout << "\tmax = " << seq_max.get() << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.get() << " , " - << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " - << seq_maxloc.getLoc() << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + 
<< seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end - -//----------------------------------------------------------------------------// + + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP reductions...\n"; // _reductions_raja_omppolicy_start - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; // _reductions_raja_omppolicy_end RAJA::ReduceSum omp_sum(0); RAJA::ReduceMin omp_min(std::numeric_limits::max()); RAJA::ReduceMax omp_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc omp_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc omp_maxloc(std::numeric_limits::min(), -1); + RAJA::ReduceMinLoc omp_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc omp_maxloc( + std::numeric_limits::min(), -1); RAJA::forall(arange, [=](int i) { - omp_sum += a[i]; omp_min.min(a[i]); @@ -153,37 +158,37 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) omp_minloc.minloc(a[i], i); omp_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << omp_sum.get() << std::endl; std::cout << "\tmin = " << omp_min.get() << std::endl; std::cout << "\tmax = " << omp_max.get() << std::endl; std::cout << "\tmin, loc = " << omp_minloc.get() << " , " - << omp_minloc.getLoc() << std::endl; + << omp_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " - << omp_maxloc.getLoc() << std::endl; + << omp_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; // _reductions_raja_cudapolicy_start - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; // _reductions_raja_cudapolicy_end RAJA::ReduceSum cuda_sum(0); RAJA::ReduceMin cuda_min(std::numeric_limits::max()); RAJA::ReduceMax cuda_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc cuda_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc cuda_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc cuda_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc cuda_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange, [=] RAJA_DEVICE(int i) { cuda_sum += a[i]; cuda_min.min(a[i]); @@ -191,39 +196,39 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) cuda_minloc.minloc(a[i], i); cuda_maxloc.maxloc(a[i], i); - }); std::cout << "\tsum = " << cuda_sum.get() << std::endl; std::cout << "\tmin = " << cuda_min.get() << std::endl; std::cout << "\tmax = " << cuda_max.get() << std::endl; std::cout << "\tmin, loc = " << cuda_minloc.get() << " , " - << cuda_minloc.getLoc() << std::endl; + << cuda_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << cuda_maxloc.get() << " , " - << cuda_maxloc.getLoc() << std::endl; + << cuda_maxloc.getLoc() << std::endl; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + 
hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); // _reductions_raja_hippolicy_start - using EXEC_POL3 = RAJA::hip_exec; + using EXEC_POL3 = RAJA::hip_exec; using REDUCE_POL3 = RAJA::hip_reduce; // _reductions_raja_hippolicy_end RAJA::ReduceSum hip_sum(0); RAJA::ReduceMin hip_min(std::numeric_limits::max()); RAJA::ReduceMax hip_max(std::numeric_limits::min()); - RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); - RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); - - RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + RAJA::ReduceMinLoc hip_minloc( + std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc( + std::numeric_limits::min(), -1); + RAJA::forall(arange, [=] RAJA_DEVICE(int i) { hip_sum += d_a[i]; hip_min.min(d_a[i]); @@ -231,28 +236,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hip_minloc.minloc(d_a[i], i); hip_maxloc.maxloc(d_a[i], i); - }); std::cout << "\tsum = " << hip_sum.get() << std::endl; std::cout << "\tmin = " << hip_min.get() << std::endl; std::cout << "\tmax = " << hip_max.get() << std::endl; std::cout << "\tmin, loc = " << hip_minloc.get() << " , " - << hip_minloc.getLoc() << std::endl; + << hip_minloc.getLoc() << std::endl; std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " - << hip_maxloc.getLoc() << std::endl; + << hip_maxloc.getLoc() << std::endl; memoryManager::deallocate_gpu(d_a); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/exercises/scan.cpp b/exercises/scan.cpp index 68f52fce2b..11e3068ff8 100644 --- a/exercises/scan.cpp +++ b/exercises/scan.cpp @@ -40,11 +40,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 16; +// constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -66,14 +66,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); @@ -85,11 +85,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -97,7 +96,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type. 
+ /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// @@ -111,7 +110,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; @@ -121,14 +120,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; @@ -138,14 +137,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; @@ -155,14 +154,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit minimum operator. + /// execution policy type and an explicit minimum operator. /// CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; @@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::seq_exec - /// execution policy type and an explicit maximum operator. + /// execution policy type and an explicit maximum operator. /// CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) @@ -182,24 +181,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; /// /// TODO... /// - /// EXERCISE: Implement an inclusive RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an inclusive RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. 
/// CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; @@ -208,8 +208,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::omp_parallel_for_exec - /// execution policy type and an explicit plus operator. + /// EXERCISE: Implement an exclusive inplace RAJA scan with + /// RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. /// CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -218,13 +219,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA scans... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -244,7 +245,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; @@ -264,7 +265,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; @@ -286,14 +287,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; @@ -301,42 +302,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. 
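// Illustrative sketch, not from the RAJA sources: every scan TODO in this
// exercise uses the same call shape. This shows the sequential inclusive and
// exclusive-in-place forms; the function name and arguments are placeholders.
#include <RAJA/RAJA.hpp>

void scan_sketch(int* in, int* out, int N)
{
  // Inclusive scan: out[i] = in[0] + ... + in[i].
  RAJA::inclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  // Exclusive in-place scan: out[i] becomes the sum of the first i elements.
  RAJA::exclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::plus<int>{});
}
// The OpenMP and GPU variants swap in RAJA::omp_parallel_for_exec,
// RAJA::cuda_exec<CUDA_BLOCK_SIZE>, or RAJA::hip_exec<HIP_BLOCK_SIZE>, as the
// scan_solution.cpp hunks later in this patch show.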
/// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement an exclusive RAJA scan with RAJA::hip_exec - /// execution policy type and an explicit plus operator. + /// execution policy type and an explicit plus operator. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top /// of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -347,11 +348,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -368,12 +369,14 @@ template void checkInclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { val = Function()(val, in[i]); - if (out[i] != val) { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } } std::cout << "\n\t result -- CORRECT\n"; @@ -386,11 +389,13 @@ template void checkExclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { - if (out[i] != val) { + for (int i = 0; i < N; ++i) + { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } val = Function()(val, in[i]); } @@ -404,6 +409,9 @@ template void printArray(const T* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << v[i]; + } std::cout << std::endl; } diff --git a/exercises/scan_solution.cpp b/exercises/scan_solution.cpp index 7ed7101192..925b586101 100644 --- a/exercises/scan_solution.cpp +++ b/exercises/scan_solution.cpp @@ -40,11 +40,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) - constexpr int CUDA_BLOCK_SIZE = 16; +constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) - constexpr int HIP_BLOCK_SIZE = 16; +constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -66,14 +66,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** 
RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA scan example...\n"; // _scan_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); @@ -85,11 +85,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_array_init_end - -//----------------------------------------------------------------------------// -// Perform various sequential scans to illustrate inclusive/exclusive, -// in-place, default scans with different operators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential scans to illustrate inclusive/exclusive, + // in-place, default scans with different operators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (default)...\n"; @@ -102,7 +101,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan (plus)...\n"; @@ -118,7 +117,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan (plus)...\n"; @@ -134,7 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; @@ -149,7 +148,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; @@ -167,23 +166,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP scans... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; // _scan_inclusive_omp_plus_start - RAJA::inclusive_scan(RAJA::make_span(in, N), - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::inclusive_scan( + RAJA::make_span(in, N), + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_inclusive_omp_plus_end CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; @@ -191,8 +191,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_exclusive_inplace_omp_plus_start RAJA::exclusive_scan_inplace( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) @@ -201,13 +200,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a few CUDA scans... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a few CUDA scans... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -215,15 +214,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_inclusive_inplace_cuda_plus_start RAJA::inclusive_scan_inplace>( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; @@ -231,15 +229,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _scan_exclusive_inplace_cuda_plus_start RAJA::exclusive_scan_inplace>( - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::make_span(out, N), RAJA::operators::plus{}); // _scan_exclusive_inplace_cuda_plus_end CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; @@ -258,14 +255,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP scans... 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP scans... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; @@ -273,33 +270,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_in = memoryManager::allocate_gpu(N); int* d_out = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _scan_inclusive_inplace_hip_plus_start RAJA::inclusive_scan_inplace>( - RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + RAJA::make_span(d_out, N), RAJA::operators::plus{}); // _scan_inclusive_inplace_hip_plus_end - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_in, in, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); RAJA::exclusive_scan>( RAJA::make_span(d_in, N), RAJA::make_span(d_out, N), RAJA::operators::plus{}); - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); @@ -310,11 +306,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
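// Illustrative sketch, not from the RAJA sources: the HIP scans above follow a
// stage / compute / copy-back flow. This condenses that flow, assuming a
// HIP-enabled build, the example's memoryManager helpers, and the
// HIP_BLOCK_SIZE constant defined near the top of the file; the function name
// is a placeholder.
#include <RAJA/RAJA.hpp>
#include "memoryManager.hpp"

void hip_scan_sketch(int* out, int N)
{
  int* d_out = memoryManager::allocate_gpu<int>(N);

  // Stage host data on the device.
  hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice));

  // Run the scan on the device; the block size is a tuning parameter.
  RAJA::inclusive_scan_inplace<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
      RAJA::make_span(d_out, N), RAJA::operators::plus<int>{});

  // Copy the result back and release device memory.
  hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost));
  memoryManager::deallocate_gpu(d_out);
}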
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -331,12 +327,14 @@ template void checkInclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { val = Function()(val, in[i]); - if (out[i] != val) { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } } std::cout << "\n\t result -- CORRECT\n"; @@ -349,11 +347,13 @@ template void checkExclusiveScanResult(const T* in, const T* out, int N) { T val = Function::identity(); - for (int i = 0; i < N; ++i) { - if (out[i] != val) { + for (int i = 0; i < N; ++i) + { + if (out[i] != val) + { std::cout << "\n\t result -- WRONG\n"; - std::cout << "\t" << out[i] << " != " << val - << " (at index " << i << ")\n"; + std::cout << "\t" << out[i] << " != " << val << " (at index " << i + << ")\n"; } val = Function()(val, in[i]); } @@ -367,6 +367,9 @@ template void printArray(const T* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << v[i]; + } std::cout << std::endl; } diff --git a/exercises/segment-indexset-basics.cpp b/exercises/segment-indexset-basics.cpp index b7c0c26458..490be37848 100644 --- a/exercises/segment-indexset-basics.cpp +++ b/exercises/segment-indexset-basics.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. * * RAJA features shown: @@ -43,59 +43,58 @@ using IdxType = int; using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. 
camp::resources::Resource host_res{camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; // _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,47 +103,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start - IdxType iis = static_cast(idx.size()); // to avoid compiler warning - for (IdxType ii = 0; ii < iis; ++ii) { - std::cout << idx[ ii ] << " "; + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; @@ -156,9 +157,9 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,43 +167,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; @@ -214,50 +214,50 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << std::endl; -//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + //----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. 
+ // // _raja_seq_indexset_policy_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; // _raja_seq_indexset_policy__end std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; @@ -265,20 +265,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Make a RAJA version of a kernel that prints the sequence - /// + /// /// 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 /// - /// using a RAJA::TypedIndexSet containing two - /// RAJA::TypedRangeSegment objects and on - /// RAJA::TypedListSegment object. + /// using a RAJA::TypedIndexSet containing two + /// RAJA::TypedRangeSegment objects and on + /// RAJA::TypedListSegment object. /// std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/segment-indexset-basics_solution.cpp b/exercises/segment-indexset-basics_solution.cpp index 4267582d98..d3bf08ec52 100644 --- a/exercises/segment-indexset-basics_solution.cpp +++ b/exercises/segment-indexset-basics_solution.cpp @@ -20,9 +20,9 @@ * * In this exercise, you will learn how to create RAJA segments and index sets * and use them to execute kernels. There are no computations performed in the - * exercises and no parallel execution. The kernels contain only print + * exercises and no parallel execution. The kernels contain only print * statements to illustrate various iteration patterns. Thus, all kernels - * look the same. The only thing that changes in these versions is the object + * look the same. The only thing that changes in these versions is the object * passed to the 'forall' method that defines the iteration space. * * RAJA features shown: @@ -43,59 +43,58 @@ using IdxType = int; using RangeSegType = RAJA::TypedRangeSegment; using RangeStrideSegType = RAJA::TypedRangeStrideSegment; using ListSegType = RAJA::TypedListSegment; -using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +using IndexSetType = RAJA::TypedIndexSet; // _raja_segment_type_end -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA segments index sets and index sets...\n"; -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. 
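// Illustrative sketch, not from the RAJA sources: the 3-segment EXERCISE above
// composes range and list segments into a single iteration space. This writes
// out the template arguments on the types the exercise aliases; the function
// name and the list indices are placeholders.
#include <RAJA/RAJA.hpp>
#include <iostream>
#include <vector>

void indexset_sketch()
{
  using IdxType      = int;
  using RangeSegType = RAJA::TypedRangeSegment<IdxType>;
  using ListSegType  = RAJA::TypedListSegment<IdxType>;
  using IndexSetType = RAJA::TypedIndexSet<RangeSegType, ListSegType>;

  // List segments copy their indices into memory owned by a camp resource.
  camp::resources::Resource host_res{camp::resources::Host()};
  std::vector<IdxType> idx = {10, 11, 14};
  ListSegType list_seg(&idx[0], idx.size(), host_res);

  IndexSetType iset;
  iset.push_back(RangeSegType(0, 8));
  iset.push_back(list_seg);

  // The outer policy walks segments; the inner policy runs each segment.
  using ISET_EXECPOL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>;

  RAJA::forall<ISET_EXECPOL>(iset, [=](IdxType i) { std::cout << i << " "; });
}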
+ // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. camp::resources::Resource host_res{camp::resources::Host()}; -//----------------------------------------------------------------------------// -// Stride-1 iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Stride-1 iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version range kernel...\n"; -// _cstyle_range1_start - for (IdxType i = 0; i < 20; i++) { - std::cout << i << " "; + // _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) + { + std::cout << i << " "; } -// _cstyle_range1_end + // _cstyle_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA range kernel...\n"; // _raja_range1_start - RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeSegType(0, 20), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 range kernel...\n"; // _raja_striderange1_start - RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_striderange1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-1 list kernel...\n"; @@ -104,61 +103,62 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Collect indices in a vector to create list segment // std::vector idx; - for (IdxType i = 0; i < 20; ++i) { - idx.push_back(i); - } + for (IdxType i = 0; i < 20; ++i) + { + idx.push_back(i); + } - ListSegType idx_list1( idx, host_res ); + ListSegType idx_list1(idx, host_res); - RAJA::forall(idx_list1, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1, + [=](IdxType i) { std::cout << i << " "; }); // _raja_list1_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-style stride-1 list kernel...\n"; // _cstyle_list1_start - IdxType iis = static_cast(idx.size()); // to avoid compiler warning - for (IdxType ii = 0; ii < iis; ++ii) { - std::cout << idx[ ii ] << " "; + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) + { + std::cout << idx[ii] << " "; } // _cstyle_list1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Negative stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Negative stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version negative stride kernel...\n"; // _cstyle_negstriderange1_start - for (IdxType i = 19; i > -1; i--) { + for (IdxType i = 19; i > -1; i--) + { std::cout << i << " "; } // _cstyle_negstriderange1_end std::cout << std::endl; -//----------------------------------// + 
//----------------------------------// std::cout << "\n Running RAJA negative stride kernel...\n"; // _raja_negstriderange1_start - RAJA::forall(RangeStrideSegType(19, -1, -1), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(19, -1, -1), + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstriderange1_end std::cout << std::endl; -//----------------------------------// -// List variant -//----------------------------------// + //----------------------------------// + // List variant + //----------------------------------// std::cout << "\n Running RAJA negative stride list kernel...\n"; @@ -166,121 +166,117 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Reverse the order of indices in the vector // - std::reverse( idx.begin(), idx.end() ); - ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + std::reverse(idx.begin(), idx.end()); + ListSegType idx_list1_reverse(&idx[0], idx.size(), host_res); - RAJA::forall(idx_list1_reverse, [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(idx_list1_reverse, + [=](IdxType i) { std::cout << i << " "; }); // _raja_negstridelist1_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// Non-unit uniform stride iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Non-unit uniform stride iteration spaces + //----------------------------------------------------------------------------// std::cout << "\n Running C-version stride-2 range kernel...\n"; // _cstyle_range2_start - for (IdxType i = 0; i < 20; i += 2) { + for (IdxType i = 0; i < 20; i += 2) + { std::cout << i << " "; } // _cstyle_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-2 range kernel...\n"; // _raja_range2_start - RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 2), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range2_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA stride-3 range kernel...\n"; // _raja_range3_start - RAJA::forall(RangeStrideSegType(0, 20, 3), [=] (IdxType i) { - std::cout << i << " "; - }); + RAJA::forall(RangeStrideSegType(0, 20, 3), + [=](IdxType i) { std::cout << i << " "; }); // _raja_range3_end std::cout << std::endl; -//----------------------------------------------------------------------------// -// IndexSets: complex iteration spaces -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // IndexSets: complex iteration spaces + //----------------------------------------------------------------------------// -// -// Sequential index set execution policy used in several of the following -// example implementations. -// + // + // Sequential index set execution policy used in several of the following + // example implementations. 
+ // std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; // _raja_indexset_2ranges_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; IndexSetType is2; - is2.push_back( RangeSegType(0, 10) ); - is2.push_back( RangeSegType(15, 20) ); - - RAJA::forall(is2, [=] (IdxType i) { - std::cout << i << " "; - }); + is2.push_back(RangeSegType(0, 10)); + is2.push_back(RangeSegType(15, 20)); + + RAJA::forall(is2, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running C-version of two segment kernel...\n"; // _cstyle_2ranges_start - for (IdxType i = 0; i < 10; ++i) { + for (IdxType i = 0; i < 10; ++i) + { std::cout << i << " "; } - for (IdxType i = 15; i < 20; ++i) { + for (IdxType i = 15; i < 20; ++i) + { std::cout << i << " "; } // _cstyle_2ranges_end std::cout << std::endl; -//----------------------------------// + //----------------------------------// std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; // _raja_indexset_3segs_start IndexSetType is3; - is3.push_back( RangeSegType(0, 8) ); + is3.push_back(RangeSegType(0, 8)); - IdxType indx[ ] = {10, 11, 14, 20, 22}; - ListSegType list2( indx, 5, host_res ); - is3.push_back( list2 ); + IdxType indx[] = {10, 11, 14, 20, 22}; + ListSegType list2(indx, 5, host_res); + is3.push_back(list2); - is3.push_back( RangeSegType(24, 28) ); - - RAJA::forall(is3, [=] (IdxType i) { - std::cout << i << " "; - }); + is3.push_back(RangeSegType(24, 28)); + + RAJA::forall(is3, + [=](IdxType i) { std::cout << i << " "; }); // _raja_indexset_3segs_end std::cout << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; - + return 0; } - diff --git a/exercises/sort.cpp b/exercises/sort.cpp index 21a5fb5edd..1b13eb20ac 100644 --- a/exercises/sort.cpp +++ b/exercises/sort.cpp @@ -8,10 +8,12 @@ #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less -#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) -#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) -#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) -#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) \ + checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) \ + checkStableSortResult(in, out, in_vals, out_vals, N) #include #include @@ -30,9 +32,9 @@ /* * Sort Exercise * - * Exercise demonstrates how to perform RAJA unstable and stable sort operations - * for integer arrays, including pairs variant, using different comparators. - * Other array data types, comparators, etc. are similar + * Exercise demonstrates how to perform RAJA unstable and stable sort + * operations for integer arrays, including pairs variant, using different + * comparators. Other array data types, comparators, etc. 
are similar * * RAJA features shown: * - `RAJA::sort` and `RAJA::sort_pairs` methods @@ -47,11 +49,11 @@ Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 16; +// constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 16; +// constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -60,14 +62,20 @@ template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -81,27 +89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937{12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937{67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -112,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -123,7 +131,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec - /// execution policy type. + /// execution policy type. /// /// NOTE: We've done this one for you to help you get started... 
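The sort calls in this exercise likewise appear without their execution-policy template parameters. A sketch of the starter sequential sort spelled out in full, assuming RAJA::seq_exec as the comment above indicates:

    // Default (non-decreasing) sequential sort of the key array.
    RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N));

    // The same sort with the comparator made explicit.
    RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                               RAJA::operators::less<int>{});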
/// @@ -134,12 +142,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; @@ -149,15 +157,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; @@ -167,15 +175,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; @@ -185,15 +193,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -204,15 +212,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -223,10 +232,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::seq_exec execution - /// policy type and an explicit greater operation. 
+ /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -234,9 +244,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; @@ -246,15 +256,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -264,24 +274,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec execution - /// policy type and an explicit greater operation. + /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec + /// execution + /// policy type and an explicit greater operation. /// - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -292,18 +304,19 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit greater operation. + /// policy type and an explicit greater operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
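A sketch of what the completed CUDA pair sort for this step presumably looks like, with the policy and comparator written out; it assumes the CUDA_BLOCK_SIZE constant noted above has been uncommented:

    // Non-increasing pair sort on the GPU: keys in 'out', values in 'out_vals'.
    RAJA::sort_pairs<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
        RAJA::make_span(out, N),
        RAJA::make_span(out_vals, N),
        RAJA::operators::greater<int>{});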
/// - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; @@ -313,26 +326,26 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement a stable RAJA pair sort with RAJA::cuda_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... + //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; @@ -342,48 +355,51 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a RAJA pair sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. 
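A corresponding sketch for the HIP pair sort, operating on the device copies made just above; it assumes the HIP_BLOCK_SIZE constant commented out near the top of the file is the one intended (the note above mentions CUDA_BLOCK_SIZE):

    // Non-decreasing pair sort on the device buffers.
    RAJA::sort_pairs<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
        RAJA::make_span(d_out, N),
        RAJA::make_span(d_out_vals, N),
        RAJA::operators::less<int>{});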
/// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... /// /// EXERCISE: Implement a stable RAJA sort with RAJA::hip_exec execution - /// policy type and an explicit less operation. + /// policy type and an explicit less operation. /// - /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -394,11 +410,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -428,9 +444,11 @@ void checkUnstableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::unordered_multiset; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -438,54 +456,60 @@ void checkUnstableSortResult(const T* in, const T* out, int N) key_iter->second.emplace(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order" - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order" + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key" + std::cout << "\t" << out[i] << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate val" + std::cout << "\t" << out[i] << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -493,9 +517,11 @@ void checkUnstableSortResult(const T* in, const T* out, // make map of keys to vals using val_map = std::unordered_multiset; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -503,48 +529,57 @@ void checkUnstableSortResult(const T* in, const T* out, key_iter->second.emplace(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order" - 
<< " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out_vals[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -561,9 +596,11 @@ void checkStableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::list; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -571,53 +608,59 @@ void checkStableSortResult(const T* in, const T* out, int N) key_iter->second.emplace_back(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order " - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order " + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key " + std::cout << "\t" << out[i] << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out[i]) { - if (correct) { + if (key_iter->second.front() != out[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " out of stable order or unknown val " + std::cout << "\t" << out[i] << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -625,9 +668,11 @@ void checkStableSortResult(const T* in, const T* out, // make map of 
keys to vals using val_map = std::list; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -635,47 +680,56 @@ void checkStableSortResult(const T* in, const T* out, key_iter->second.emplace_back(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order " - << " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out_vals[i]) { - if (correct) { + if (key_iter->second.front() != out_vals[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -688,7 +742,10 @@ template void printArray(const T* k, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << k[i]; + } std::cout << std::endl; } /// @@ -696,7 +753,9 @@ template void printArray(const T* k, const U* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + for (int i = 0; i < N; ++i) + { + std::cout << " (" << k[i] << "," << v[i] << ")"; + } std::cout << std::endl; } - diff --git a/exercises/sort_solution.cpp b/exercises/sort_solution.cpp index 98f65c6dbe..5414885e67 100644 --- a/exercises/sort_solution.cpp +++ b/exercises/sort_solution.cpp @@ -8,10 +8,12 @@ #define OP_GREATER RAJA::operators::greater #define OP_LESS RAJA::operators::less -#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) -#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) -#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) -#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) \ + checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) 
checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) \ + checkStableSortResult(in, out, in_vals, out_vals, N) #include #include @@ -30,9 +32,9 @@ /* * Sort Exercise * - * Exercise demonstrates how to perform RAJA unstable and stable sort operations - * for integer arrays, including pairs variant, using different comparators. - * Other array data types, comparators, etc. are similar + * Exercise demonstrates how to perform RAJA unstable and stable sort + * operations for integer arrays, including pairs variant, using different + * comparators. Other array data types, comparators, etc. are similar * * RAJA features shown: * - `RAJA::sort` and `RAJA::sort_pairs` methods @@ -60,14 +62,20 @@ constexpr int HIP_BLOCK_SIZE = 16; template void checkUnstableSortResult(const T* in, const T* out, int N); template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void checkStableSortResult(const T* in, const T* out, int N); template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N); +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N); // template void printArray(const T* k, int N); @@ -81,27 +89,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nRAJA sort example...\n"; // _sort_array_init_start -// -// Define array length -// + // + // Define array length + // constexpr int N = 20; -// -// Allocate and initialize vector data -// + // + // Allocate and initialize vector data + // int* in = memoryManager::allocate(N); int* out = memoryManager::allocate(N); unsigned* in_vals = memoryManager::allocate(N); unsigned* out_vals = memoryManager::allocate(N); - std::iota(in , in + N/2, 0); - std::iota(in + N/2, in + N , 0); - std::shuffle(in , in + N/2, std::mt19937{12345u}); - std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + std::iota(in, in + N / 2, 0); + std::iota(in + N / 2, in + N, 0); + std::shuffle(in, in + N / 2, std::mt19937{12345u}); + std::shuffle(in + N / 2, in + N, std::mt19937{67890u}); - std::fill(in_vals , in_vals + N/2, 0); - std::fill(in_vals + N/2, in_vals + N , 1); + std::fill(in_vals, in_vals + N / 2, 0); + std::fill(in_vals + N / 2, in_vals + N, 1); std::cout << "\n in keys...\n"; printArray(in, N); @@ -112,10 +120,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _sort_array_init_end -//----------------------------------------------------------------------------// -// Perform various sequential sorts to illustrate unstable/stable, -// pairs, default sorts with different comparators -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform various sequential sorts to illustrate unstable/stable, + // pairs, default sorts with different comparators + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (default)...\n"; @@ -125,12 +133,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort (non-decreasing)...\n"; @@ -141,12 +149,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_seq_less_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; @@ -157,12 +165,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_stable_seq_less_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; @@ -173,12 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_seq_greater_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; @@ -191,12 +199,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_pairs_seq_less_end - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; @@ -209,7 +218,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -217,9 +227,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) -//----------------------------------------------------------------------------// -// Perform a couple of OpenMP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of OpenMP sorts... 
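The solution calls below also lose their execution-policy template arguments in this form; the OpenMP sort, for instance, presumably reads:

    RAJA::sort<RAJA::omp_parallel_for_exec>(RAJA::make_span(out, N),
                                            RAJA::operators::less<int>{});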
+ //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; @@ -230,12 +240,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_omp_less_end - //checkUnstableSortResult>(in, out, N); + // checkUnstableSortResult>(in, out, N); CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; @@ -243,25 +253,27 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_stable_pairs_omp_greater_start - RAJA::stable_sort_pairs(RAJA::make_span(out, N), - RAJA::make_span(out_vals, N), - RAJA::operators::greater{}); + RAJA::stable_sort_pairs( + RAJA::make_span(out, N), + RAJA::make_span(out_vals, N), + RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end - //checkStableSortResult>(in, out, in_vals, out_vals, N); + // checkStableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -//----------------------------------------------------------------------------// -// Perform a couple of CUDA sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of CUDA sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; @@ -269,41 +281,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in_vals, N, out_vals); // _sort_pairs_cuda_greater_start - RAJA::sort_pairs>(RAJA::make_span(out, N), - RAJA::make_span(out_vals, N), - RAJA::operators::greater{}); + RAJA::sort_pairs>( + RAJA::make_span(out, N), + RAJA::make_span(out_vals, N), + RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; std::copy_n(in, N, out); // _sort_stable_cuda_less_start - RAJA::stable_sort>(RAJA::make_span(out, N), - RAJA::operators::less{}); + RAJA::stable_sort>( + RAJA::make_span(out, N), RAJA::operators::less{}); // _sort_stable_cuda_less_end - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -//----------------------------------------------------------------------------// -// Perform a couple of HIP sorts... -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Perform a couple of HIP sorts... 
+ //----------------------------------------------------------------------------// std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; @@ -313,38 +327,41 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* d_out = memoryManager::allocate_gpu(N); int* d_out_vals = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk( + hipMemcpy(d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice)); - RAJA::sort_pairs>(RAJA::make_span(d_out, N), - RAJA::make_span(d_out_vals, N), - RAJA::operators::less{}); + RAJA::sort_pairs>( + RAJA::make_span(d_out, N), + RAJA::make_span(d_out_vals, N), + RAJA::operators::less{}); - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); + hipErrchk( + hipMemcpy(out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + // checkUnstableSortResult>(in, out, in_vals, + // out_vals, N); CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); - hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_out, out, N * sizeof(int), hipMemcpyHostToDevice)); // _sort_stable_hip_greater_start RAJA::stable_sort>( - RAJA::make_span(d_out, N), - RAJA::operators::greater{}); + RAJA::make_span(d_out, N), RAJA::operators::greater{}); // _sort_stable_hip_greater_end - hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(out, d_out, N * sizeof(int), hipMemcpyDeviceToHost)); - //checkStableSortResult>(in, out, N); + // checkStableSortResult>(in, out, N); CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -355,11 +372,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(in); memoryManager::deallocate(out); @@ -389,9 +406,11 @@ void checkUnstableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::unordered_multiset; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -399,54 +418,60 @@ void checkUnstableSortResult(const T* in, const T* out, int N) key_iter->second.emplace(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order" - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order" + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key" + std::cout << "\t" << out[i] << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate val" + std::cout << "\t" << out[i] << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkUnstableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkUnstableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -454,9 +479,11 @@ void checkUnstableSortResult(const T* in, const T* out, // make map of keys to vals using val_map = std::unordered_multiset; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -464,48 +491,57 @@ void checkUnstableSortResult(const T* in, const T* out, key_iter->second.emplace(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order" - 
<< " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key" << " (at index " << i << ")\n"; } auto val_iter = key_iter->second.find(out_vals[i]); - if (val_iter == key_iter->second.end()) { - if (correct) { + if (val_iter == key_iter->second.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate val" << " (at index " << i << ")\n"; } key_iter->second.erase(val_iter); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -522,9 +558,11 @@ void checkStableSortResult(const T* in, const T* out, int N) // make map of keys to keys using val_map = std::list; std::unordered_map keys; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys.find(in[i]); - if (key_iter == keys.end()) { + if (key_iter == keys.end()) + { auto ret = keys.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -532,53 +570,59 @@ void checkStableSortResult(const T* in, const T* out, int N) key_iter->second.emplace_back(in[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i-1] << ", " << out[i] - << " out of order " - << " (at index " << i-1 << ")\n"; + std::cout << "\t" << out[i - 1] << ", " << out[i] << " out of order " + << " (at index " << i - 1 << ")\n"; } // test there is an item with this auto key_iter = keys.find(out[i]); - if (key_iter == keys.end()) { - if (correct) { + if (key_iter == keys.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " unknown or duplicate key " + std::cout << "\t" << out[i] << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out[i]) { - if (correct) { + if (key_iter->second.front() != out[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } - std::cout << "\t" - << out[i] - << " out of stable order or unknown val " + std::cout << "\t" << out[i] << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } template -void checkStableSortResult(const T* in, const T* out, - const U* in_vals, const U* out_vals, int N) +void checkStableSortResult(const T* in, + const T* out, + const U* in_vals, + const U* out_vals, + int N) { Comparator comp; bool correct = true; @@ -586,9 +630,11 @@ void checkStableSortResult(const T* in, const T* out, // make map of 
keys to vals using val_map = std::list; std::unordered_map keys_to_vals; - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { auto key_iter = keys_to_vals.find(in[i]); - if (key_iter == keys_to_vals.end()) { + if (key_iter == keys_to_vals.end()) + { auto ret = keys_to_vals.emplace(in[i], val_map{}); assert(ret.second); key_iter = ret.first; @@ -596,47 +642,56 @@ void checkStableSortResult(const T* in, const T* out, key_iter->second.emplace_back(in_vals[i]); } - for (RAJA::Index_type i = 0; i < N; i++) { + for (RAJA::Index_type i = 0; i < N; i++) + { // test ordering - if (i > 0 && comp(out[i], out[i-1])) { - if (correct) { + if (i > 0 && comp(out[i], out[i - 1])) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i-1] << "," << out_vals[i-1] << ")," - << " (" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i - 1] << "," << out_vals[i - 1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" << " out of order " - << " (at index " << i-1 << ")\n"; + << " (at index " << i - 1 << ")\n"; } // test there is a pair with this key and val auto key_iter = keys_to_vals.find(out[i]); - if (key_iter == keys_to_vals.end()) { - if (correct) { + if (key_iter == keys_to_vals.end()) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " unknown or duplicate key " << " (at index " << i << ")\n"; } - if (key_iter->second.front() != out_vals[i]) { - if (correct) { + if (key_iter->second.front() != out_vals[i]) + { + if (correct) + { std::cout << "\n\t result -- WRONG\n"; correct = false; } std::cout << "\t" - << "(" << out[i] << "," << out_vals[i] << ")" + << "(" << out[i] << "," << out_vals[i] << ")" << " out of stable order or unknown val " << " (at index " << i << ")\n"; } key_iter->second.pop_front(); - if (key_iter->second.size() == 0) { + if (key_iter->second.size() == 0) + { keys_to_vals.erase(key_iter); } } - if (correct) { + if (correct) + { std::cout << "\n\t result -- CORRECT\n"; } } @@ -649,7 +704,10 @@ template void printArray(const T* k, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + for (int i = 0; i < N; ++i) + { + std::cout << " " << k[i]; + } std::cout << std::endl; } @@ -657,7 +715,9 @@ template void printArray(const T* k, const U* v, int N) { std::cout << std::endl; - for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + for (int i = 0; i < N; ++i) + { + std::cout << " (" << k[i] << "," << v[i] << ")"; + } std::cout << std::endl; } - diff --git a/exercises/tutorial_halfday/ex2_approx-pi.cpp b/exercises/tutorial_halfday/ex2_approx-pi.cpp index c1ccc05aee..f5487fd9f9 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -46,38 +46,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // const int N = 512 * 512; - const double dx = 1.0 / double(N); + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; @@ -85,31 +85,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::seq_exec execution policy type and a + /// method with RAJA::seq_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::seq_reduce policy type /// to accumulate the sum. /// /// NOTE: We've done this one for you to help you get started... /// - using EXEC_POL1 = RAJA::seq_exec; + using EXEC_POL1 = RAJA::seq_exec; using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); - double seq_pi_val = seq_pi.get() * 4.0; + double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. 
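The reduction object and forall call above lose their template parameters in the reformatted lines; a sketch of the sequential variant with them restored, using the policy aliases defined in the same hunk:

    RAJA::ReduceSum<REDUCE_POL1, double> seq_pi(0.0);

    RAJA::forall<EXEC_POL1>(RAJA::RangeSegment(0, N), [=](int i) {
      double x = (double(i) + 0.5) * dx;
      seq_pi += dx / (1.0 + x * x);
    });

    double seq_pi_val = seq_pi.get() * 4.0;  // accumulated sum times 4 approximates pi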
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -117,22 +116,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -142,23 +141,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::omp_parallel_for_exec execution policy type + /// method with RAJA::omp_parallel_for_exec execution policy type /// and a RAJA::ReduceSum object with RAJA::omp_reduce policy type /// to accumulate the sum. - /// + /// double omp_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -168,16 +166,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the pi approximation kernel using a RAJA::forall - /// method with RAJA::cuda_exec execution policy type and a + /// method with RAJA::cuda_exec execution policy type and a /// RAJA::ReduceSum object with RAJA::cuda_reduce policy type /// to accumulate the sum. - /// + /// double cuda_pi_val = 0.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp index 5654ffbea2..42a3895b48 100644 --- a/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp +++ b/exercises/tutorial_halfday/ex2_approx-pi_solution.cpp @@ -15,7 +15,7 @@ * EXERCISE #2: Approximate pi using a Riemann sum * * In this exercise, you will apprimate pi using the formula - * + * * pi/4 = atan(1) = integral (1/1+x^2) dx, where integral is over the * interval [0, 1]. 
* @@ -28,7 +28,7 @@ * - `forall` loop iteration template method * - Index range segment * - Sum reduction - * - Execution and reduction policies + * - Execution and reduction policies */ /* @@ -43,59 +43,58 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #2: Approximate pi using a Riemann sum...\n"; -// -// Define number of subintervals (N) and size of each subinterval (dx) used in -// Riemann integral sum to approximate pi. -// + // + // Define number of subintervals (N) and size of each subinterval (dx) used in + // Riemann integral sum to approximate pi. + // const int N = 512 * 512; - const double dx = 1.0 / double(N); + const double dx = 1.0 / double(N); -// Set precision for printing pi + // Set precision for printing pi int prec = 16; -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running C-style sequential pi approximation...\n"; - + double c_pi = 0.0; - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi += dx / (1.0 + x * x); + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi += dx / (1.0 + x * x); } c_pi *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi << std::endl; -//----------------------------------------------------------------------------// -// RAJA sequential variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant. + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential pi approximation...\n"; - using EXEC_POL1 = RAJA::seq_exec; - using REDUCE_POL1 = RAJA::seq_reduce; + using EXEC_POL1 = RAJA::seq_exec; + using REDUCE_POL1 = RAJA::seq_reduce; - RAJA::ReduceSum< REDUCE_POL1, double > seq_pi(0.0); + RAJA::ReduceSum seq_pi(0.0); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - seq_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + seq_pi += dx / (1.0 + x * x); }); double seq_pi_val = seq_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << seq_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << seq_pi_val << std::endl; -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -103,65 +102,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) double c_pi_omp = 0.0; - #pragma omp parallel for reduction(+:c_pi_omp) - for (int i = 0; i < N; ++i) { - double x = (double(i) + 0.5) * dx; - c_pi_omp += dx / (1.0 + x * x); +#pragma omp parallel for reduction(+ : c_pi_omp) + for (int i = 0; i < N; ++i) + { + double x = (double(i) + 0.5) * dx; + c_pi_omp += dx / (1.0 + x * x); } c_pi_omp *= 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << c_pi_omp << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << c_pi_omp << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP pi approximation...\n"; - using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using EXEC_POL2 = RAJA::omp_parallel_for_exec; using REDUCE_POL2 = RAJA::omp_reduce; - RAJA::ReduceSum< REDUCE_POL2, double > omp_pi(0.0); + RAJA::ReduceSum omp_pi(0.0); - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=](int i) { - double x = (double(i) + 0.5) * dx; - omp_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + double x = (double(i) + 0.5) * dx; + omp_pi += dx / (1.0 + x * x); }); double omp_pi_val = omp_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << omp_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << omp_pi_val << std::endl; #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA pi approximation...\n"; - using EXEC_POL3 = RAJA::cuda_exec; + using EXEC_POL3 = RAJA::cuda_exec; using REDUCE_POL3 = RAJA::cuda_reduce; - RAJA::ReduceSum< REDUCE_POL3, double > cuda_pi(0.0); + RAJA::ReduceSum cuda_pi(0.0); - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - double x = (double(i) + 0.5) * dx; - cuda_pi += dx / (1.0 + x * x); + RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + double x = (double(i) + 0.5) * dx; + cuda_pi += dx / (1.0 + x * x); }); double cuda_pi_val = cuda_pi.get() * 4.0; - std::cout << "\tpi = " << std::setprecision(prec) - << cuda_pi_val << std::endl; + std::cout << "\tpi = " << std::setprecision(prec) << cuda_pi_val << std::endl; #endif diff --git a/exercises/tutorial_halfday/ex5_line-of-sight.cpp b/exercises/tutorial_halfday/ex5_line-of-sight.cpp index c17fb2eb8a..1d22a04dd9 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight.cpp @@ -24,30 +24,30 @@ * * Given an observation point X on a terrain map, and a set of points * {Y0, Y1, Y2, ...} along a ray starting at X, find which points on the - * terrain at Y0, Y1, etc. 
are visible from the point at X. A point is - * visible from the point at X if and only if there is no other point on the - * terrain that blocks its view from the point at X. More precisely, - * a point on the terrain at Y is visible from the point at X if and only if - * no other point on the terrain between X and Y has a greater vertical angle + * terrain at Y0, Y1, etc. are visible from the point at X. A point is + * visible from the point at X if and only if there is no other point on the + * terrain that blocks its view from the point at X. More precisely, + * a point on the terrain at Y is visible from the point at X if and only if + * no other point on the terrain between X and Y has a greater vertical angle * from the point at X than the point at Y. So although a point at Y may - * be at a higher altitude than all other points on the terrain between Y + * be at a higher altitude than all other points on the terrain between Y * and X, the point at Y may not be visible from the point at X. * - * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' - * such that dist[i] is the horizontal distance between X and Yi, and a - * vector 'alt' such that alt[i] is the altitude at point Yi. To solve - * the line of sight problem, we compute an angle vector 'ang', where + * Let 'altX' be the altidue at point X. Suppose we have a vector 'dist' + * such that dist[i] is the horizontal distance between X and Yi, and a + * vector 'alt' such that alt[i] is the altitude at point Yi. To solve + * the line of sight problem, we compute an angle vector 'ang', where * ang[i] = arctan( (alt[i] - altX)/(dist[i]). Next, we perform a "max" - * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point + * scan on the vector 'ang' to form the vector 'ang_max'. Then, the point * at Yi is visible from the point at X if ang[i] >= ang_max[i]. Otherwise, * the point at Yi is not visible. * * This file contains a C-style sequential implementation of the solution to - * the line-of-sight problem. Where indicated by comments, you will fill in + * the line-of-sight problem. Where indicated by comments, you will fill in * sequential and OpenMP versions of the algorithm using a RAJA scan operation * to compute the 'ang_max' vector and a RAJA forall method to determine which - * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA - * compiler, fill in the RAJA CUDA version of the algorithm also. + * points are/are not visible. If you have access to an NVIDIA GPU and a CUDA + * compiler, fill in the RAJA CUDA version of the algorithm also. * * RAJA features you will use: * - inclusive scan operations with 'max' operator @@ -96,52 +96,59 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? 
i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array - // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + // + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -153,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// First, use a 'max' RAJA::inclusive_scan on the angle vector /// with RAJA::seq_exec execution policy. Then, use a RAJA::forall /// template with the same execution policy to determine which /// points are visible. @@ -162,12 +169,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -181,23 +188,23 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::omp_parallel_for_exec execution policy. 
Then, use - /// a RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::omp_parallel_for_exec execution policy. Then, use + /// a RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -211,16 +218,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the line-of-sight algorithm using RAJA constructs. - /// First, use a 'max' RAJA::inclusive_scan on the angle vector - /// with RAJA::cuda_exec execution policy. Then, use a - /// RAJA::forall template with the same execution policy to + /// First, use a 'max' RAJA::inclusive_scan on the angle vector + /// with RAJA::cuda_exec execution policy. Then, use a + /// RAJA::forall template with the same execution policy to /// determine which points are visible. /// num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -248,13 +255,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } @@ -268,7 +282,8 @@ template void printArray(T* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp index 12348816a1..5da99b7fe2 100644 --- a/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp +++ b/exercises/tutorial_halfday/ex5_line-of-sight_solution.cpp @@ -93,52 +93,59 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) int* visible = memoryManager::allocate(N); int* visible_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { - dist[i] = static_cast(i+1); - double alt_fact = alt_max * ( (i+1) % 5 == 0 ? i*10 : i+1 ); - alt[i] = alt_fact * - static_cast( rand() ) / static_cast( RAND_MAX ); + for (int i = 0; i < N; ++i) + { + dist[i] = static_cast(i + 1); + double alt_fact = alt_max * ((i + 1) % 5 == 0 ? 
i * 10 : i + 1); + alt[i] = + alt_fact * static_cast(rand()) / static_cast(RAND_MAX); } // // Set angle array // - for (int i = 0; i < N; ++i) { - ang[i] = atan2( alt[i], dist[i] ); // set angle in radians + for (int i = 0; i < N; ++i) + { + ang[i] = atan2(alt[i], dist[i]); // set angle in radians } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style sequential line-of-sight algorithm...\n"; std::memset(visible_ref, 0, N * sizeof(int)); ang_max[0] = ang[0]; - for (int i = 1; i < N; ++i) { - ang_max[i] = std::max(ang[i], ang_max[i-1]); + for (int i = 1; i < N; ++i) + { + ang_max[i] = std::max(ang[i], ang_max[i - 1]); } int num_visible = 0; - for (int i = 0; i < N; ++i) { - if ( ang[i] >= ang_max[i] ) { - visible_ref[i] = 1; - num_visible++; - } else { - visible_ref[i] = 0; - } + for (int i = 0; i < N; ++i) + { + if (ang[i] >= ang_max[i]) + { + visible_ref[i] = 1; + num_visible++; + } + else + { + visible_ref[i] = 0; + } } std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible_ref, N); + // printArray(visible_ref, N); -//----------------------------------------------------------------------------// -// RAJA sequential variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA sequential variant + //----------------------------------------------------------------------------// std::cout << "\n\n Running RAJA sequential line-of-sight algorithm...\n"; @@ -148,27 +155,30 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL1 = RAJA::seq_exec; - RAJA::inclusive_scan< EXEC_POL1 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); -//----------------------------------------------------------------------------// -// RAJA OpenMP multithreading variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP multithreading variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -180,28 +190,31 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL2 = RAJA::omp_parallel_for_exec; - RAJA::inclusive_scan< EXEC_POL2 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + 
RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=] (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA variant -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA variant + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -213,21 +226,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) using EXEC_POL3 = RAJA::cuda_exec; - RAJA::inclusive_scan< EXEC_POL3 >(RAJA::make_span(ang, N), - RAJA::make_span(ang_max, N), - RAJA::operators::maximum{} ); + RAJA::inclusive_scan(RAJA::make_span(ang, N), + RAJA::make_span(ang_max, N), + RAJA::operators::maximum{}); - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - if ( ang[i] >= ang_max[i] ) { - visible[i] = 1; - } else { - visible[i] = 0; + RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(int i) { + if (ang[i] >= ang_max[i]) + { + visible[i] = 1; + } + else + { + visible[i] = 0; } }); num_visible = checkResult(visible, visible_ref, N); std::cout << "\n\t num visible points = " << num_visible << "\n\n"; -//printArray(visible, N); + // printArray(visible, N); #endif @@ -255,13 +271,20 @@ int checkResult(int* visible, int* visible_ref, int len) int num_visible = 0; bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && visible[i] != visible_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && visible[i] != visible_ref[i]) + { + correct = false; + } num_visible += visible[i]; } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } @@ -275,7 +298,8 @@ template void printArray(T* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp index 4d29f7b3ae..9e3968d313 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. 
Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One (Part a) has column indexing as stride-1 with the outer - * loop traversing the rows ('i' loop variable) and the inner loop traversing - * the columns ('j' loop variable). The other (Part B) has row indexing as - * stride-1 and reverses the order of the loops. This shows that a C-style - * implementation requires two different implementations, one for each loop - * order, since the array offset arithmetic is different in the two cases. - * Where indicated by comments, you will fill in versions using - * two-dimensional RAJA Views with offset layouts. One loop ordering requires - * permutations, while the other does not. If done properly, you will see - * that both RAJA versions have identical inner loop bodies, which is not the + * This file contains two C-style sequential implementations of stencil + * computation. One (Part a) has column indexing as stride-1 with the outer + * loop traversing the rows ('i' loop variable) and the inner loop traversing + * the columns ('j' loop variable). The other (Part B) has row indexing as + * stride-1 and reverses the order of the loops. 
This shows that a C-style + * implementation requires two different implementations, one for each loop + * order, since the array offset arithmetic is different in the two cases. + * Where indicated by comments, you will fill in versions using + * two-dimensional RAJA Views with offset layouts. One loop ordering requires + * permutations, while the other does not. If done properly, you will see + * that both RAJA versions have identical inner loop bodies, which is not the * case for the C-style variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). // enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// - const int Nr_int = 5; + // + // Define number of rows and columns of cells in the 2D mesh. + // + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// + // + // Allocate and initialize input array + // int* B = memoryManager::allocate(tot_cells * sizeof(int)); int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// Part A: -// -// Variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part A: + // + // Variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. 
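  // For example, with Nr_int = 5 (so Nr_tot = 7), the first interior cell
  // (i = 1, j = 1) maps to idx = 1 + 7 * 1 = 8, and stepping j by one moves
  // to the adjacent entry idx = 9, so each column of the padded mesh is laid
  // out contiguously.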
+ // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; int idx_in = (j + 1) + Nr_tot * (i + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E - B[idx_in - 1] + B[idx_in + 1]; // S, N - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E + B[idx_in - 1] + B[idx_in + 1]; // S, N } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,114 +206,120 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE (Part A): + /// EXERCISE (Part A): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops - // iterate over the interior (i, j) indices. + /// A array. The B array access requires an offset since the loops + // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the column (j-loop) indexing + /// For this part (A) of the exercise, the column (j-loop) indexing /// has stride 1. /// - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Part B: -// -// Variant of stencil computation with row indexing as stride-1. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Part B: + // + // Variant of stencil computation with row indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; int idx_in = (i + 1) + Nc_tot * (j + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N - B[idx_in - 1] + B[idx_in + 1]; // W, E - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N + B[idx_in - 1] + B[idx_in + 1]; // W, E } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout << "\n\n Running stencil computation with RAJA Views " + "(permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); /// /// TODO... /// - /// EXERCISE (Part B): + /// EXERCISE (Part B): /// - /// Fill in the stencil computation below where you use RAJA::View + /// Fill in the stencil computation below where you use RAJA::View /// objects for accessing entries in the A and B arrays. You will use /// a RAJA::OffsetLayout for the B array and a RAJA::Layout for the - /// A array. The B array access requires an offset since the loops + /// A array. The B array access requires an offset since the loops // iterate over the interior (i, j) indices. /// - /// For this part (A) of the exercise, the row (i-loop) indexing - /// has stride 1. Thus, layouts for the A and B arrays require + /// For this part (A) of the exercise, the row (i-loop) indexing + /// has stride 1. 
Thus, layouts for the A and B arrays require /// the same permutation. /// - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { // fill in the loop body - } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -321,19 +330,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (0 indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). // void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim) { std::cout << std::endl; - for (int j = 0; j < Nrows; ++j) { - for (int i = 0; i < Ncols; ++i) { + for (int j = 0; j < Nrows; ++j) + { + for (int i = 0; i < Ncols; ++i) + { int idx = 0; - if ( stride1dim == Stride1::Columns ) { + if (stride1dim == Stride1::Columns) + { idx = j + Nrows * i; - } else { + } + else + { idx = i + Ncols * j; } std::cout << v[idx] << " "; @@ -350,15 +364,20 @@ void checkResult(int* A, int* A_ref, int Ntot) { bool pass = true; - for (int i = 0; i < Ntot; ++i) { - if ( pass && (A[i] != A_ref[i]) ) { + for (int i = 0; i < Ntot; ++i) + { + if (pass && (A[i] != A_ref[i])) + { pass = false; } } - if (pass) { + if (pass) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp index 51aad20dae..e323c3f4d3 100644 --- a/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp +++ b/exercises/tutorial_halfday/ex6_stencil-offset-layout_solution.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #6: Offset layout stencil computation. + * EXERCISE #6: Offset layout stencil computation. * * In this exercise, you will use RAJA Layouts and Views to perform * a simple 5-point stencil computation on a 2-dimensional Cartesian mesh. @@ -26,23 +26,23 @@ * The five-cell stencil accumulates values in a cell from itself and * its four neighbors. Assuming the cells are indexed using (i,j) pairs on * the two dimensional mesh, the stencil computation looks like: - * + * * out(i, j) = in(i, j) + in(i - 1, j) + in(i + 1, j) + * in(i, j - 1) + in(i, j + 1) * * where 'in' is the input data array and 'out' is the result of - * the stencil computation. For simplicity, in the code examples, we refer - * to the index tuples used to access input array entries as C (center), + * the stencil computation. For simplicity, in the code examples, we refer + * to the index tuples used to access input array entries as C (center), * W (west), E (east), S (south), and N (north). * - * We assume that the input array has an entry for N x M interior mesh cells + * We assume that the input array has an entry for N x M interior mesh cells * plus a one cell wide halo region around the mesh interior; i.e., the size * of the input array is (N + 2) * (M + 2). The output array has an entry * for N x M interior mesh cells only, so its size is N * M. 
Note that since - * the arrays have different sizes, C-style indexing requires different + * the arrays have different sizes, C-style indexing requires different * offset values in the code for accessing a cell entry in each array. - * - * The input array is initialized so that the entry for each interior cell + * + * The input array is initialized so that the entry for each interior cell * is one and the entry for each halo cell is zero. So for the case where * N = 3 and M = 2, the input array looks like: * @@ -66,7 +66,7 @@ * | 3 | 4 | 3 | * ------------- * - * You can think about indexing into this mesh as illustrated in the + * You can think about indexing into this mesh as illustrated in the * following diagram: * * --------------------------------------------------- @@ -79,31 +79,31 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left corner of the interior - * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom + * Notably (0, 0) corresponds to the bottom left corner of the interior + * region, which extends to (2, 1), and (-1, -1) corresponds to the bottom * left corner of the halo region, which extends to (3, 2). * - * This file contains two C-style sequential implementations of stencil - * computation. One has column indexing as stride-1 with the outer loop - * traversing the rows ('i' loop variable) and the inner loop traversing the + * This file contains two C-style sequential implementations of stencil + * computation. One has column indexing as stride-1 with the outer loop + * traversing the rows ('i' loop variable) and the inner loop traversing the * columns ('j' loop variable). The other has row indexing as stride-1 and - * reverses the order of the loops. This shows that a C-style implementation + * reverses the order of the loops. This shows that a C-style implementation * requires two different implementations, one for each loop order, since the - * array offset arithmetic is different in the two cases. Where indicated + * array offset arithmetic is different in the two cases. Where indicated * by comments, you will fill in versions using two-dimensional RAJA Views * with offset layouts. One loop ordering requires permutations, while the * other does not. If done properly, you will see that both RAJA versions * have identical inner loop bodies, which is not the case for the C-style * variants. * - * Note that you will use the same for-loop patterns as the C-style loops. + * Note that you will use the same for-loop patterns as the C-style loops. * In a later exercise, we will show you how to use RAJA's nested loop - * support, which allows you to write both RAJA variants with identical + * support, which allows you to write both RAJA variants with identical * source code. * * RAJA features you will use: * - Offset-layouts and RAJA Views - * + * * Since this exercise is done on a CPU only, we use C++ new and delete * operators to allocate and deallocate the arrays we will use. */ @@ -111,14 +111,14 @@ // // Functions for printing and checking results // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (Rows indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (Rows indicates each row is stride-1, // Columns indicates each column is stride-1). 
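//
// For example, Stride1::Columns selects the idx = j + Nrows * i mapping used
// when each column is contiguous, while Stride1::Rows selects
// idx = i + Ncols * j; main() below exercises both cases via the
// commented-out printArrayOnMesh(...) calls.
//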
// enum class Stride1 { - Rows, - Columns + Rows, + Columns }; void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim); void checkResult(int* A, int* A_ref, int Ntot); @@ -128,73 +128,76 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n\nExercise #6: Offset layout stencil computation...\n"; -// -// Define number of rows and columns of cells in the 2D mesh. -// + // + // Define number of rows and columns of cells in the 2D mesh. + // const int DIM = 2; - const int Nr_int = 5; + const int Nr_int = 5; const int Nc_int = 8; - const int Nr_tot = Nr_int + 2; + const int Nr_tot = Nr_int + 2; const int Nc_tot = Nc_int + 2; - + const int int_cells = Nr_int * Nc_int; - const int tot_cells = Nr_tot * Nc_tot; + const int tot_cells = Nr_tot * Nc_tot; -// -// Allocate and initialize input array -// + // + // Allocate and initialize input array + // int* B = memoryManager::allocate(tot_cells * sizeof(int)); int* A = memoryManager::allocate(int_cells * sizeof(int)); int* A_ref = memoryManager::allocate(int_cells * sizeof(int)); -//----------------------------------------------------------------------------// -// First variant of stencil computation with column indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // First variant of stencil computation with column indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that j is the stride-1 index. -// - for (int i = 1; i <= Nc_int; ++i) { - for (int j = 1; j <= Nr_int; ++j) { + // + // We assume that for each cell id (i,j) that j is the stride-1 index. + // + for (int i = 1; i <= Nc_int; ++i) + { + for (int j = 1; j <= Nr_int; ++j) + { int idx = j + Nr_tot * i; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Columns); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { int idx_out = j + Nr_int * i; int idx_in = (j + 1) + Nr_tot * (i + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E - B[idx_in - 1] + B[idx_in + 1]; // S, N - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nr_tot] + B[idx_in + Nr_tot] + // W, E + B[idx_in - 1] + B[idx_in + 1]; // S, N } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (no permutation). 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (no permutation). + //----------------------------------------------------------------------------// std::cout << "\n\n Running stencil computation with RAJA Views...\n"; @@ -203,78 +206,83 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Create offset Layout and Views for data access. Note that only // the input array access requires an offset since the loops iterate over - // the interior (i, j) indices. We can use the default layout for the - // output array. Also, since the 'j' index (rightmost) is stride-1, + // the interior (i, j) indices. We can use the default layout for the + // output array. Also, since the 'j' index (rightmost) is stride-1, // we don't need a permutation for this case. // RAJA::OffsetLayout B_layout = - RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}); + RAJA::make_offset_layout({{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}}); RAJA::View> Bview(B, B_layout); RAJA::View> Aview(A, Nc_int, Nr_int); - for (int i = 0; i < Nc_int; ++i) { - for (int j = 0; j < Nr_int; ++j) { - - Aview(i, j) = Bview(i, j) + // C - Bview(i - 1, j) + Bview(i + 1, j) + // W, E - Bview(i, j - 1) + Bview(i, j + 1); // S, N + for (int i = 0; i < Nc_int; ++i) + { + for (int j = 0; j < Nr_int; ++j) + { + Aview(i, j) = Bview(i, j) + // C + Bview(i - 1, j) + Bview(i + 1, j) + // W, E + Bview(i, j - 1) + Bview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Columns); -//----------------------------------------------------------------------------// -// Second variant of stencil computation with row indexing as stride-1. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Second variant of stencil computation with row indexing as stride-1. + //----------------------------------------------------------------------------// std::memset(B, 0, tot_cells * sizeof(int)); -// -// We assume that for each cell id (i,j) that i is the stride-1 index. -// - for (int j = 1; j <= Nr_int; ++j) { - for (int i = 1; i <= Nc_int; ++i) { + // + // We assume that for each cell id (i,j) that i is the stride-1 index. + // + for (int j = 1; j <= Nr_int; ++j) + { + for (int i = 1; i <= Nc_int; ++i) + { int idx = i + Nc_tot * j; B[idx] = 1; } } -//printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); + // printArrayOnMesh(B, Nr_tot, Nc_tot, Stride1::Rows); -//----------------------------------------------------------------------------// -// C-style stencil computation establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style stencil computation establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::cout << "\n\n Running C-style stencil computation (reference soln)...\n"; std::memset(A_ref, 0, int_cells * sizeof(int)); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { int idx_out = i + Nc_int * j; int idx_in = (i + 1) + Nc_tot * (j + 1); - A_ref[idx_out] = B[idx_in] + // C - B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N - B[idx_in - 1] + B[idx_in + 1]; // W, E - + A_ref[idx_out] = B[idx_in] + // C + B[idx_in - Nc_tot] + B[idx_in + Nc_tot] + // S, N + B[idx_in - 1] + B[idx_in + 1]; // W, E } } -//printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A_ref, Nr_int, Nc_int, Stride1::Rows); -//----------------------------------------------------------------------------// -// Variant using RAJA Layouts and Views (with permutation). -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // Variant using RAJA Layouts and Views (with permutation). + //----------------------------------------------------------------------------// - std::cout << "\n\n Running stencil computation with RAJA Views (permuted)...\n"; + std::cout << "\n\n Running stencil computation with RAJA Views " + "(permuted)...\n"; std::memset(A, 0, int_cells * sizeof(int)); @@ -289,35 +297,35 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // application. // - std::array perm {{1, 0}}; // 'i' index (position zero0) - // is stride-1 + std::array perm{{1, 0}}; // 'i' index (position zero0) + // is stride-1 - RAJA::OffsetLayout pB_layout = - RAJA::make_permuted_offset_layout( {{-1, -1}}, {{Nc_tot-1, Nr_tot-1}}, - perm ); + RAJA::OffsetLayout pB_layout = RAJA::make_permuted_offset_layout( + {{-1, -1}}, {{Nc_tot - 1, Nr_tot - 1}}, perm); - RAJA::Layout pA_layout = - RAJA::make_permuted_layout( {{Nc_int, Nr_int}}, perm ); + RAJA::Layout pA_layout = + RAJA::make_permuted_layout({{Nc_int, Nr_int}}, perm); RAJA::View> pBview(B, pB_layout); RAJA::View> pAview(A, pA_layout); - for (int j = 0; j < Nr_int; ++j) { - for (int i = 0; i < Nc_int; ++i) { - - pAview(i, j) = pBview(i, j) + // C - pBview(i - 1, j) + pBview(i + 1, j) + // W, E - pBview(i, j - 1) + pBview(i, j + 1); // S, N + for (int j = 0; j < Nr_int; ++j) + { + for (int i = 0; i < Nc_int; ++i) + { + pAview(i, j) = pBview(i, j) + // C + pBview(i - 1, j) + pBview(i + 1, j) + // W, E + pBview(i, j - 1) + pBview(i, j + 1); // S, N } } checkResult(A, A_ref, int_cells); -//printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); + // printArrayOnMesh(A, Nr_int, Nc_int, Stride1::Rows); -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(B); memoryManager::deallocate(A); memoryManager::deallocate(A_ref); @@ -328,19 +336,24 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // -// For array printing, 'stride1dim' indicates which mesh dimenstride is -// stride-1 (0 indicates each row is stride-1, +// For array printing, 'stride1dim' indicates which mesh dimenstride is +// stride-1 (0 indicates each row is stride-1, // 1 indicates each column is stride-1). 
// void printArrayOnMesh(int* v, int Nrows, int Ncols, Stride1 stride1dim) { std::cout << std::endl; - for (int j = 0; j < Nrows; ++j) { - for (int i = 0; i < Ncols; ++i) { + for (int j = 0; j < Nrows; ++j) + { + for (int i = 0; i < Ncols; ++i) + { int idx = 0; - if ( stride1dim == Stride1::Columns ) { + if (stride1dim == Stride1::Columns) + { idx = j + Nrows * i; - } else { + } + else + { idx = i + Ncols * j; } std::cout << v[idx] << " "; @@ -357,15 +370,20 @@ void checkResult(int* A, int* A_ref, int Ntot) { bool pass = true; - for (int i = 0; i < Ntot; ++i) { - if ( pass && (A[i] != A_ref[i]) ) { + for (int i = 0; i < Ntot; ++i) + { + if (pass && (A[i] != A_ref[i])) + { pass = false; } } - if (pass) { + if (pass) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } diff --git a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp index d183c221fa..f9ac15ab9e 100644 --- a/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp +++ b/exercises/tutorial_halfday/ex8_tiled-matrix-transpose.cpp @@ -16,13 +16,13 @@ /* * EXERCISE #8: Tiled Matrix Transpose * - * In this exercise, you will use RAJA constructs to transpose a matrix + * In this exercise, you will use RAJA constructs to transpose a matrix * using a loop tiling algorithm. An input matrix A of dimension N_r x N_c * is provided. You will fill in the entries of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. * You will complete implementations of multiple RAJA variants by filling - * in missing elements of RAJA kernel API execution policies as well as the + * in missing elements of RAJA kernel API execution policies as well as the * RAJA kernel implementation for each. Variants you will complete include * sequential, OpenMP, and CUDA execution. 
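 *
 * As a concrete illustration of the tiling (using the names that appear in
 * the C-style loops below): the tile with outer indices (by, bx) covers the
 * entries row = by * TILE_SZ + trow, col = bx * TILE_SZ + tcol for
 * trow, tcol in [0, TILE_SZ), and the bounds check row < N_r && col < N_c
 * simply skips the part of a partial tile that falls outside the matrix.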
* @@ -52,7 +52,7 @@ void checkResult(RAJA::View> Atview, int N_r, int N_c); template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -66,8 +66,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -80,9 +80,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -97,14 +96,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -112,8 +113,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries @@ -121,29 +124,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that output matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bx * TILE_SZ + tcol; // Matrix column index + int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. @@ -152,14 +157,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // global iteration number. 
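  //
  // A minimal sketch of one possible sequential tiled variant (not
  // necessarily the solution intended for the exercise below). It assumes
  // the TILE_SZ constant, the row_Range/col_Range segments, and the
  // Aview/Atview views defined in this file, and mirrors the structure of
  // the OpenMP policies that follow.
  //
  using KERNEL_EXEC_POL_SEQ =
      RAJA::KernelPolicy<
        RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
          RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_SZ>, RAJA::seq_exec,
            RAJA::statement::For<1, RAJA::seq_exec,       // rows within a tile
              RAJA::statement::For<0, RAJA::seq_exec,     // cols within a tile (stride-1 output)
                RAJA::statement::Lambda<0>
              >
            >
          >
        >
      >;

  RAJA::kernel<KERNEL_EXEC_POL_SEQ>(
      RAJA::make_tuple(col_Range, row_Range),
      [=](int col, int row) { Atview(col, row) = Aview(row, col); });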
// -// Note: this needs to be turned on for other back-ends when working the +// Note: this needs to be turned on for other back-ends when working the // exercises (sequential, CUDA, etc.) #if defined(RAJA_ENABLE_OPENMP) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -199,7 +204,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -238,9 +244,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -249,27 +256,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP loop enabling parallel loads/reads // to/from the tile. // - using KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list + using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -334,16 +338,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -355,11 +365,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " 
<< Atview(row, col) // << std::endl; - std::cout<> Atview, int N_r, int N_c); template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #8: RAJA Tiled Matrix Transpose...\n"; @@ -64,8 +64,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of tiled matrix transpose, we @@ -78,9 +78,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -95,14 +94,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } - //printResult(Aview, N_r, N_c); + // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -110,38 +111,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int by = 0; by < outer_Dimr; ++by) { - for (int bx = 0; bx < outer_Dimc; ++bx) { + for (int by = 0; by < outer_Dimr; ++by) + { + for (int bx = 0; bx < outer_Dimc; ++bx) + { // // (1) Loops to iterate over tile entries // // Note: loops are ordered so that output matrix data access - // is stride-1. + // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bx * TILE_SZ + tcol; // Matrix column index - int row = by * TILE_SZ + trow; // Matrix row index + int col = bx * TILE_SZ + tcol; // Matrix column index + int row = by * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Aview(row, col); } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // - // The following RAJA variants will use the RAJA::kernel method to + // The following RAJA variants will use the RAJA::kernel method to // perform the matrix transpose operation. // // Here, we define RAJA range segments to establish the iteration spaces. 
@@ -152,7 +157,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA sequential tiled matrix transpose ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -162,32 +167,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // - using KERNEL_EXEC_POL_SEQ = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - Atview(col, row) = Aview(row, col); - }); + using KERNEL_EXEC_POL_SEQ = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::seq_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; + + RAJA::kernel( + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top inner loop...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - parallel top " + "inner loop...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -196,35 +200,31 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - > - > - >; + using KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::For< + 1, + RAJA::omp_parallel_for_exec, + RAJA::statement:: + For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>>>; RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed inner loops...\n"; + std::cout << "\n Running RAJA openmp tiled matrix transpose - collapsed " + "inner loops...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -234,27 +234,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // to/from the tile. 
// - using KERNEL_EXEC_POL_OMP2 = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Collapse, - RAJA::statement::Lambda<0> - > //closes collapse - > // closes Tile 0 - > // closes Tile 1 - >; // closes policy list + using KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile<0, + RAJA::tile_fixed, + RAJA::seq_exec, + RAJA::statement::Collapse< + RAJA::omp_parallel_collapse_exec, + RAJA::ArgList<0, 1>, + RAJA::statement::Lambda<0>> // closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -267,29 +264,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using KERNEL_EXEC_POL_CUDA = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - RAJA::statement::For<1, RAJA::cuda_thread_y_direct, - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - > - > - > - > - >; - - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - Atview(col, row) = Aview(row, col); - - }); + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + RAJA::statement::For< + 1, + RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>>>>>; + + RAJA::kernel(RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE(int col, int row) { + Atview(col, row) = Aview(row, col); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -314,16 +307,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -335,11 +334,13 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) // << std::endl; - std::cout< void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -71,8 +71,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = 
memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -85,9 +85,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -102,14 +101,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -117,8 +118,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -129,14 +132,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -148,25 +154,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. // - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -177,7 +185,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed statements. Iterations inside a RAJA loop is given by their // global iteration number. 
// -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. +#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. RAJA::RangeSegment row_Range(0, N_r); RAJA::RangeSegment col_Range(0, N_c); #endif @@ -190,7 +198,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -199,7 +207,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -425,16 +433,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -446,8 +460,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp index 1900bf1157..9603820403 100644 --- a/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp +++ b/exercises/tutorial_halfday/ex9_matrix-transpose-local-array_solution.cpp @@ -17,10 +17,10 @@ * EXERCISE #9: Matrix Transpose with Local Array * * In this exercise, you will use RAJA constructs to transpose a matrix - * using a loop tiling algorithm similar to exercise 8. However, this + * using a loop tiling algorithm similar to exercise 8. However, this * exercise is different in that you will use a local array to write - * to and read from as each matrix tile is transposed. An input matrix - * A of dimension N_r x N_c is provided. You will fill in the entries + * to and read from as each matrix tile is transposed. An input matrix + * A of dimension N_r x N_c is provided. You will fill in the entries * of the transpose matrix At. * * This file contains a C-style variant of the sequential matrix transpose. 
@@ -57,7 +57,7 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise #9: RAJA local array matrix transpose...\n"; @@ -71,8 +71,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate matrix data // - int *A = memoryManager::allocate(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); + int* A = memoryManager::allocate(N_r * N_c); + int* At = memoryManager::allocate(N_r * N_c); // // In the following implementations of matrix transpose, we @@ -85,9 +85,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Construct a permuted layout for At so that the column index has stride 1 // - std::array perm {{1, 0}}; - RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout( {{N_c, N_r}}, - perm ); + std::array perm{{1, 0}}; + RAJA::Layout<2> perm_layout = RAJA::make_permuted_layout({{N_c, N_r}}, perm); RAJA::View> Atview(At, perm_layout); // @@ -102,14 +101,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Initialize matrix data // - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { Aview(row, col) = col; } } // printResult(Aview, N_r, N_c); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running C-version of local array matrix transpose...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -117,8 +118,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (0) Outer loops to iterate over tiles // - for (int brow = 0; brow < outer_Dimr; ++brow) { - for (int bcol = 0; bcol < outer_Dimc; ++bcol) { + for (int brow = 0; brow < outer_Dimr; ++brow) + { + for (int bcol = 0; bcol < outer_Dimc; ++bcol) + { // Stack-allocated local array for data on a tile int Tile[TILE_SZ][TILE_SZ]; @@ -129,14 +132,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loops are ordered so that input matrix data access // is stride-1. // - for (int trow = 0; trow < TILE_SZ; ++trow) { - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { + for (int trow = 0; trow < TILE_SZ; ++trow) + { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Tile[trow][tcol] = Aview(row, col); } } @@ -148,25 +154,27 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1. 
// - for (int tcol = 0; tcol < TILE_SZ; ++tcol) { - for (int trow = 0; trow < TILE_SZ; ++trow) { + for (int tcol = 0; tcol < TILE_SZ; ++tcol) + { + for (int trow = 0; trow < TILE_SZ; ++trow) + { - int col = bcol * TILE_SZ + tcol; // Matrix column index - int row = brow * TILE_SZ + trow; // Matrix row index + int col = bcol * TILE_SZ + tcol; // Matrix column index + int row = brow * TILE_SZ + trow; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_r && col < N_c) + { Atview(col, row) = Tile[trow][tcol]; } } } - } } checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // // The following RAJA variants will use the RAJA::kernel method to @@ -188,7 +196,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // using TILE_MEM = - RAJA::LocalArray, RAJA::SizeList>; + RAJA::LocalArray, RAJA::SizeList>; // **NOTE** The LocalArray is created here, but it's memory is not yet // allocated. This is done when the 'InitLocalMem' statement @@ -197,55 +205,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) TILE_MEM RAJA_Tile; -//--------------------------------------------------------------------------// + //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); - using SEQ_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - - > - > - > - >; + using SEQ_EXEC_POL = RAJA::KernelPolicy, + RAJA::seq_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>> + + >>>>; + + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), - - RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - - [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - - RAJA_Tile(trow, tcol) = Aview(row, col); - - }, - - [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { + RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - Atview(col, row) = RAJA_Tile(trow, tcol); + [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { + RAJA_Tile(trow, tcol) = Aview(row, col); + }, - }); + [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { + Atview(col, row) = RAJA_Tile(trow, tcol); + }); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -257,49 +265,48 
@@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using OPENMP_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::omp_parallel_for_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::seq_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::seq_exec, - RAJA::statement::Lambda<1> - > - > - > - > - > - >; - - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + using OPENMP_EXEC_POL = RAJA::KernelPolicy, + RAJA::omp_parallel_for_exec, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::seq_exec, + + RAJA::statement::InitLocalMem< + RAJA::cpu_tile_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::seq_exec, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::seq_exec, + RAJA::statement::Lambda<1>>>>>>>; + + RAJA::kernel_param( + RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), [=](int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - RAJA_Tile(trow, tcol) = Aview(row, col); - }, [=](int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); @@ -315,55 +322,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); using CUDA_EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::Tile<1, RAJA::tile_fixed, - RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, - RAJA::cuda_block_x_loop, - - RAJA::statement::InitLocalMem, - - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> - > - >, - - RAJA::statement::CudaSyncThreads, - - RAJA::statement::ForICount<0, RAJA::statement::Param<0>, - RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<1>, - RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> - > - >, - - RAJA::statement::CudaSyncThreads - > - > - > - > - >; - - - RAJA::kernel_param( RAJA::make_tuple(col_Range, row_Range), + RAJA::KernelPolicy, + RAJA::cuda_block_y_loop, + RAJA::statement::Tile< + 0, + RAJA::tile_fixed, + RAJA::cuda_block_x_loop, + + RAJA::statement::InitLocalMem< + RAJA::cuda_shared_mem, + RAJA::ParamList<2>, + + RAJA::statement::ForICount< + 1, + RAJA::statement::Param<1>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, + RAJA::statement::Param<0>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0>>>, + + RAJA::statement::CudaSyncThreads, + + RAJA::statement::ForICount< + 0, + RAJA::statement::Param<0>, + RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, + RAJA::statement::Param<1>, + RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1>>>, + + RAJA::statement::CudaSyncThreads>>>>>; + + + RAJA::kernel_param( + 
RAJA::make_tuple(col_Range, row_Range), RAJA::make_tuple((int)0, (int)0, RAJA_Tile), - [=] RAJA_DEVICE (int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { - + [=] RAJA_DEVICE( + int col, int row, int tcol, int trow, TILE_MEM& RAJA_Tile) { RAJA_Tile(trow, tcol) = Aview(row, col); - }, - [=] RAJA_DEVICE(int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { - + [=] RAJA_DEVICE( + int col, int row, int tcol, int trow, TILE_MEM RAJA_Tile) { Atview(col, row) = RAJA_Tile(trow, tcol); - }); checkResult(Atview, N_c, N_r); @@ -391,16 +398,22 @@ template void checkResult(RAJA::View> Atview, int N_r, int N_c) { bool match = true; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { - if (Atview(row, col) != row) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { + if (Atview(row, col) != row) + { match &= false; } } } - if (match) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -412,8 +425,10 @@ template void printResult(RAJA::View> Atview, int N_r, int N_c) { std::cout << std::endl; - for (int row = 0; row < N_r; ++row) { - for (int col = 0; col < N_c; ++col) { + for (int row = 0; row < N_r; ++row) + { + for (int col = 0; col < N_c; ++col) + { std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) << std::endl; } diff --git a/exercises/tutorial_halfday/memoryManager.hpp b/exercises/tutorial_halfday/memoryManager.hpp index 83fb8cb3bb..c563033f9c 100644 --- a/exercises/tutorial_halfday/memoryManager.hpp +++ b/exercises/tutorial_halfday/memoryManager.hpp @@ -28,12 +28,12 @@ namespace memoryManager { template -T *allocate(RAJA::Index_type size) +T* allocate(RAJA::Index_type size) { - T *ptr; + T* ptr; #if defined(RAJA_ENABLE_CUDA) cudaErrchk( - cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); + cudaMallocManaged((void**)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); #else ptr = new T[size]; #endif @@ -41,9 +41,10 @@ T *allocate(RAJA::Index_type size) } template -void deallocate(T *&ptr) +void deallocate(T*& ptr) { - if (ptr) { + if (ptr) + { #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaFree(ptr)); #else @@ -54,31 +55,32 @@ void deallocate(T *&ptr) } #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - template - T *allocate_gpu(RAJA::Index_type size) - { - T *ptr; +template +T* allocate_gpu(RAJA::Index_type size) +{ + T* ptr; #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); + cudaErrchk(cudaMalloc((void**)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); + hipErrchk(hipMalloc((void**)&ptr, sizeof(T) * size)); #endif - return ptr; - } + return ptr; +} - template - void deallocate_gpu(T *&ptr) +template +void deallocate_gpu(T*& ptr) +{ + if (ptr) { - if (ptr) { #if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaFree(ptr)); + cudaErrchk(cudaFree(ptr)); #elif defined(RAJA_ENABLE_HIP) - hipErrchk(hipFree(ptr)); + hipErrchk(hipFree(ptr)); #endif - ptr = nullptr; - } + ptr = nullptr; } +} #endif -}; // namespace memoryManager +}; // namespace memoryManager #endif diff --git a/exercises/vector-addition.cpp b/exercises/vector-addition.cpp index dbe5260f6d..4528d7a8c9 100644 --- a/exercises/vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, 
where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,75 +32,77 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. */ /* Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -//constexpr int CUDA_BLOCK_SIZE = 256; +// constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -//constexpr int HIP_BLOCK_SIZE = 256; +// constexpr int HIP_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_SYCL) -//constexpr int SYCL_BLOCK_SIZE = 256; +// constexpr int SYCL_BLOCK_SIZE = 256; #endif // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. 
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -110,25 +112,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::seq_exec execution policy type. + /// method and RAJA::seq_exec execution policy type. /// /// NOTE: We've done this one for you to help you get started... /// // _rajaseq_vector_add_start - RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); @@ -142,12 +143,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -155,21 +156,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -185,13 +187,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. 
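As a minimal sketch of what the CUDA TODO section below asks for, mirroring the RAJA::forall pattern used in the matching solution file later in this patch (it assumes the d_a/d_b/d_c device arrays allocated in that section and the CUDA_BLOCK_SIZE constant declared near the top of the file):

    // Minimal CUDA variant of the vector add; CUDA_BLOCK_SIZE must be
    // uncommented near the top of the file, as the TODO note says.
    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
        RAJA::TypedRangeSegment<int>(0, N),
        [=] RAJA_DEVICE (int i) { d_c[i] = d_a[i] + d_b[i]; });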
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -199,12 +201,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); /// /// TODO... @@ -213,53 +215,54 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::cuda_exec execution policy type. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector " + "addition...\n"; /// /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall - /// method and RAJA::cuda_exec execution policy type with + /// method and RAJA::cuda_exec execution policy type with /// arguments defining 2 blocks per SM and asynchronous execution. /// /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); /// /// TODO... @@ -268,29 +271,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); @@ -302,24 +305,24 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// method and RAJA::hip_exec execution policy type. /// /// NOTE: You will have to uncomment 'SYCL_BLOCK_SIZE' near the - /// top of the file if you want to use it here. + /// top of the file if you want to use it here. /// memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. 
+ // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -336,12 +339,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -352,9 +362,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp index 3bbc070731..5149d23d56 100644 --- a/exercises/vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -16,7 +16,7 @@ /* * Vector Addition Exercise * - * In this exercise, you will compute c = a + b, where a, b, c are + * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. * * This file contains sequential and OpenMP variants of the vector addition @@ -24,7 +24,7 @@ * plus a RAJA CUDA version if you have access to an NVIDIA GPU and a CUDA * compiler, in empty code sections indicated by comments. * - * The exercise shows you how to use RAJA in its simplest form and + * The exercise shows you how to use RAJA in its simplest form and * illustrates similarities between a C-style for-loop and a RAJA forall loop. * * RAJA features you will use: @@ -32,7 +32,7 @@ * - Index range segment * - Execution policies * - * Note: if CUDA is enabled, CUDA unified memory is used. + * Note: if CUDA is enabled, CUDA unified memory is used. */ /* @@ -53,93 +53,89 @@ constexpr int SYCL_BLOCK_SIZE = 256; // // Functions for checking and printing arrays // -void checkResult(int* c, int* c_ref, int len); +void checkResult(int* c, int* c_ref, int len); void printArray(int* v, int len); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: RAJA Vector Addition...\n"; -// -// Define vector length -// + // + // Define vector length + // constexpr int N = 1000000; -// -// Allocate and initialize vector data to random numbers in [1, 10]. -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - int *c_ref = memoryManager::allocate(N); + // + // Allocate and initialize vector data to random numbers in [1, 10]. + // + int* a = memoryManager::allocate(N); + int* b = memoryManager::allocate(N); + int* c = memoryManager::allocate(N); + int* c_ref = memoryManager::allocate(N); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { a[i] = rand() % 10 + 1; b[i] = rand() % 10 + 1; } -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::memset(c_ref, 0, N * sizeof(int)); std::cout << "\n Running C-style sequential vector addition...\n"; // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { c_ref[i] = a[i] + b[i]; } // _cstyle_vector_add_end -//printArray(c_ref, N); + // printArray(c_ref, N); -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::seq_exec policy enforces strictly sequential execution. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA sequential vector addition...\n"; // _rajaseq_vector_add_start - RAJA::forall< RAJA::seq_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaseq_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy attempts to force the compiler to generate SIMD -// vectorization optimizations. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::simd_exec policy attempts to force the compiler to generate SIMD + // vectorization optimizations. + //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); std::cout << "\n Running RAJA SIMD vector addition...\n"; - RAJA::forall( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -147,21 +143,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vector addition...\n"; - #pragma omp parallel for - for (int i = 0; i < N; ++i) { +#pragma omp parallel for + for (int i = 0; i < N; ++i) + { c[i] = a[i] + b[i]; } checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::omp_parallel_for_exec policy runs the loop in parallel using -// OpenMP multithreading. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::omp_parallel_for_exec policy runs the loop in parallel using + // OpenMP multithreading. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) @@ -170,21 +167,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA OpenMP multithreaded vector addition...\n"; // _rajaomp_vector_add_start - RAJA::forall< RAJA::omp_parallel_for_exec >( - RAJA::TypedRangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - } - ); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int i) { c[i] = a[i] + b[i]; }); // _rajaomp_vector_add_end checkResult(c, c_ref, N); -//printArray(c, N); +// printArray(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) @@ -192,116 +186,113 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); - cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice)); + cudaErrchk(cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice)); // _rajacuda_vector_add_start - RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_vector_add_end - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printArray(c, N); + // printArray(c, N); -//----------------------------------------------------------------------------// -// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a -// GPU device with 2 blocks per SM. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a + // GPU device with 2 blocks per SM. 
+ //----------------------------------------------------------------------------// std::memset(c, 0, N * sizeof(int)); - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector " + "addition...\n"; // _rajacuda_explicit_vector_add_start const bool Asynchronous = true; - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajacuda_explicit_vector_add_end - cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + cudaErrchk(cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); +// printResult(c, N); #endif -//----------------------------------------------------------------------------// -// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy(d_a, a, N * sizeof(int), hipMemcpyHostToDevice)); + hipErrchk(hipMemcpy(d_b, b, N * sizeof(int), hipMemcpyHostToDevice)); // _rajahip_vector_add_start - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajahip_vector_add_end - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy(c, d_c, N * sizeof(int), hipMemcpyDeviceToHost)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// -// RAJA::sycl_exec policy runs the loop as a SYCL kernel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA::sycl_exec policy runs the loop as a SYCL kernel. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) std::cout << "\n Running RAJA SYCL vector addition...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); + int* d_a = memoryManager::allocate_gpu(N); + int* d_b = memoryManager::allocate_gpu(N); + int* d_c = memoryManager::allocate_gpu(N); memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); // _rajasycl_vector_add_start - RAJA::forall>(RAJA::TypedRangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); + RAJA::forall>( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int i) { d_c[i] = d_a[i] + d_b[i]; }); // _rajasycl_vector_add_end memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); checkResult(c, c_ref, N); -//printResult(c, N); + // printResult(c, N); memoryManager::deallocate_gpu(d_a); memoryManager::deallocate_gpu(d_b); memoryManager::deallocate_gpu(d_c); #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// -// -// Clean up. -// + // + // Clean up. + // memoryManager::deallocate(a); memoryManager::deallocate(b); memoryManager::deallocate(c); @@ -318,12 +309,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(int* c, int* c_ref, int len) { bool correct = true; - for (int i = 0; i < len; i++) { - if ( correct && c[i] != c_ref[i] ) { correct = false; } + for (int i = 0; i < len; i++) + { + if (correct && c[i] != c_ref[i]) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -334,9 +332,9 @@ void checkResult(int* c, int* c_ref, int len) void printArray(int* v, int len) { std::cout << std::endl; - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { std::cout << "v[" << i << "] = " << v[i] << std::endl; } std::cout << std::endl; } - diff --git a/exercises/vertexsum-indexset.cpp b/exercises/vertexsum-indexset.cpp index 258250a741..028293a7f9 100644 --- a/exercises/vertexsum-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. 
You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,189 +68,204 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. + // constexpr int N = 1000; constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end + // _vertexsum_define_end double* areae = memoryManager::allocate(Nelem_tot); double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int j = ie / Nelem; - int imap = 4 * ie ; + int imap = 4 * ie; e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. 
+ //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. 
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); /// /// TODO... @@ -260,56 +275,56 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// below to check if it's correct. 
/// -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_omp_end + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); /// /// TODO... 
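For reference while reading this hunk: the TODO left in the exercise file is resolved in exercises/vertexsum-indexset_solution.cpp later in this patch by pushing the remaining three ListSegments onto the IndexSet and iterating it with a segment-sequential policy that launches a CUDA kernel per segment. A minimal sketch of that pattern, not part of the patch itself, assuming the SegmentType alias, cuda_res resource, and idx vectors shown above, plus a CUDA_BLOCK_SIZE constant defined earlier in the file:

  // Sketch: complete the colored IndexSet (one ListSegment per color).
  cuda_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), cuda_res));
  cuda_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), cuda_res));
  cuda_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), cuda_res));

  // Iterate segments sequentially on the host; run each segment as a CUDA
  // kernel, so no two concurrent updates touch the same vertex.
  using EXEC_POL2 =
      RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<CUDA_BLOCK_SIZE>>;

  RAJA::forall<EXEC_POL2>(cuda_colorset, [=] RAJA_DEVICE(int ie) {
    int* iv = &(e2v_map[4 * ie]);
    areav[iv[0]] += areae[ie] / 4.0;
    areav[iv[1]] += areae[ie] / 4.0;
    areav[iv[2]] += areae[ie] / 4.0;
    areav[iv[3]] += areae[ie] / 4.0;
  });

Because elements of the same color share no vertex, the per-segment kernels need no atomics; the coloring supplies the correctness that a flat parallel loop over all elements would lack.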
@@ -321,84 +336,85 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start - using EXEC_POL2 = RAJA::ExecPolicy>; + // _raja_vertexarea_cuda_start + using EXEC_POL2 = + RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_cuda_end + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy( + d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource hip_res{camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start - using EXEC_POL3 = RAJA::ExecPolicy>; + // _raja_vertexarea_hip_start + using EXEC_POL3 = + RAJA::ExecPolicy>; - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + RAJA::forall(hip_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; }); -// _raja_vertexarea_hip_end + // _raja_vertexarea_hip_end - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -406,7 +422,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -425,12 +441,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -441,11 +464,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/vertexsum-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp index 5c1617343a..2861109eda 100644 --- a/exercises/vertexsum-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -20,7 +20,7 @@ /* * Mesh vertex area exercise * - * In this exercise, you will use a RAJA TypedIndexSet containing 4 + * In this exercise, you will use a RAJA TypedIndexSet containing 4 * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 @@ -32,13 +32,13 @@ * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in - * parallel. This exercise illustrates how RAJA can be used to enable one + * parallel. This exercise illustrates how RAJA can be used to enable one * to get some parallelism from such operations without fundamentally * changing the way the algorithm looks in source code. * * This file contains sequential and OpenMP variants of the vertex area - * computation using C-style for-loops. You will fill in RAJA versions of - * these variants, plus a RAJA CUDA version if you have access to an NVIDIA + * computation using C-style for-loops. You will fill in RAJA versions of + * these variants, plus a RAJA CUDA version if you have access to an NVIDIA * GPU and a CUDA compiler, in empty code sections indicated by comments. * * RAJA features you will use: @@ -68,329 +68,345 @@ void checkResult(double* a, double* aref, int n); void printMeshData(double* v, int n, int joff); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; -// _vertexsum_define_start -// -// 2D mesh has N^2 elements (N+1)^2 vertices. -// + // _vertexsum_define_start + // + // 2D mesh has N^2 elements (N+1)^2 vertices. 
+ // constexpr int N = 1000; constexpr int Nelem = N; constexpr int Nelem_tot = Nelem * Nelem; constexpr int Nvert = N + 1; constexpr int Nvert_tot = Nvert * Nvert; -// _vertexsum_define_end + // _vertexsum_define_end double* areae = memoryManager::allocate(Nelem_tot); double* areav = memoryManager::allocate(Nvert_tot); double* areav_ref = memoryManager::allocate(Nvert_tot); - int* e2v_map = memoryManager::allocate(4*Nelem_tot); + int* e2v_map = memoryManager::allocate(4 * Nelem_tot); -// _vertexsum_elemarea_start -// -// Define mesh spacing factor 'h' and set up elem to vertex mapping array. -// + // _vertexsum_elemarea_start + // + // Define mesh spacing factor 'h' and set up elem to vertex mapping array. + // constexpr double h = 0.1; - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int j = ie / Nelem; - int imap = 4 * ie ; + int imap = 4 * ie; e2v_map[imap] = ie + j; - e2v_map[imap+1] = ie + j + 1; - e2v_map[imap+2] = ie + j + Nvert; - e2v_map[imap+3] = ie + j + 1 + Nvert; + e2v_map[imap + 1] = ie + j + 1; + e2v_map[imap + 2] = ie + j + Nvert; + e2v_map[imap + 3] = ie + j + 1 + Nvert; } -// -// Initialize element areas so each element area -// depends on the i,j coordinates of the element. -// + // + // Initialize element areas so each element area + // depends on the i,j coordinates of the element. + // std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - areae[ie] = h*(i+1) * h*(j+1); + areae[ie] = h * (i + 1) * h * (j + 1); } -// _vertexsum_elemarea_end + // _vertexsum_elemarea_end -//std::cout << "\n Element areas...\n"; -//printMeshData(areae, Nelem, Nelem); + // std::cout << "\n Element areas...\n"; + // printMeshData(areae, Nelem, Nelem); -//----------------------------------------------------------------------------// -// C-style sequential variant establishes reference solution to compare with. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style sequential variant establishes reference solution to compare with. + //----------------------------------------------------------------------------// std::cout << "\n Running sequential C-style version of vertex sum...\n"; -// _cstyle_vertexarea_seq_start + // _cstyle_vertexarea_seq_start std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem_tot; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) + { + int* iv = &(e2v_map[4 * ie]); + areav_ref[iv[0]] += areae[ie] / 4.0; + areav_ref[iv[1]] += areae[ie] / 4.0; + areav_ref[iv[2]] += areae[ie] / 4.0; + areav_ref[iv[3]] += areae[ie] / 4.0; } -// _cstyle_vertexarea_seq_end - -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); - - -//----------------------------------------------------------------------------// -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. 
-// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA IndexSet to define the element -// partitioning. -// - -// _vertexarea_color_start -// -// Gather the element indices for each color in a vector. -// - std::vector< std::vector > idx(4); - - for (int ie = 0; ie < Nelem_tot; ++ie) { + // _cstyle_vertexarea_seq_end + + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); + + + //----------------------------------------------------------------------------// + // + // In the following, we partition the element iteration space into four + // subsets (or "colors") indicated by numbers in the figure below. + // + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // | 2 | 3 | 2 | 3 | + // ----------------- + // | 0 | 1 | 0 | 1 | + // ----------------- + // + // Since none of the elements with the same number share a common vertex, + // we can iterate over each subset ("color") in parallel. + // + // We use RAJA ListSegments and a RAJA IndexSet to define the element + // partitioning. + // + + // _vertexarea_color_start + // + // Gather the element indices for each color in a vector. + // + std::vector> idx(4); + + for (int ie = 0; ie < Nelem_tot; ++ie) + { int i = ie % Nelem; int j = ie / Nelem; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { + if (i % 2 == 0) + { + if (j % 2 == 0) + { idx[0].push_back(ie); - } else { + } + else + { idx[2].push_back(ie); } - } else { - if ( j % 2 == 0 ) { + } + else + { + if (j % 2 == 0) + { idx[1].push_back(ie); - } else { + } + else + { idx[3].push_back(ie); } } } -// _vertexarea_color_end + // _vertexarea_color_end -//----------------------------------------------------------------------------// -// C-style OpenMP multithreading variant. Note that we use the vectors -// defined above in this variant to run each element subset in parallel. -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // C-style OpenMP multithreading variant. Note that we use the vectors + // defined above in this variant to run each element subset in parallel. 
+ //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running C-style OpenMP vertex sum...\n"; -// _cstyle_vertexarea_omp_start + // _cstyle_vertexarea_omp_start std::memset(areav, 0, Nvert_tot * sizeof(double)); - for (int icol = 0; icol < 4; ++icol) { - const std::vector& ievec = idx[icol]; - const int len = static_cast(ievec.size()); - - #pragma omp parallel for - for (int i = 0; i < len; ++i) { - int ie = ievec[i]; - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; - } - + for (int icol = 0; icol < 4; ++icol) + { + const std::vector& ievec = idx[icol]; + const int len = static_cast(ievec.size()); + +#pragma omp parallel for + for (int i = 0; i < len; ++i) + { + int ie = ievec[i]; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; + } } -// _cstyle_vertexarea_omp_end + // _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex areas (reference)...\n"; -//printMeshData(areav_ref, Nvert, jvoff); + // std::cout << "\n Vertex areas (reference)...\n"; + // printMeshData(areav_ref, Nvert, jvoff); #endif // The IndexSet is a variadic template, where the template arguments -// are the segment types that the IndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) -// _vertexarea_listsegtype_start +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || \ + defined(RAJA_ENABLE_HIP) + // _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; // _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) -// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in host (CPU) memory. + // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. 
-// _vertexarea_indexset_start + // _vertexarea_indexset_start RAJA::TypedIndexSet colorset; - colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); - colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); - colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); -// _vertexarea_indexset_end + colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), host_res)); + colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), host_res)); + colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), host_res)); + colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), host_res)); + // _vertexarea_indexset_end -//----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration -// over segments, OpenMP parallel iteration of each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration + // over segments, OpenMP parallel iteration of each segment) + //----------------------------------------------------------------------------// std::cout << "\n Running RAJA OpenMP index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_omp_start - using EXEC_POL1 = RAJA::ExecPolicy; + // _raja_vertexarea_omp_start + using EXEC_POL1 = + RAJA::ExecPolicy; RAJA::forall(colorset, [=](int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_omp_end + // _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, Nvert); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, Nvert); #endif -//----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration -// over segments, CUDA kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA CUDA vertex sum calculation using IndexSet (sequential iteration + // over segments, CUDA kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. -// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. 
+ // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet cuda_colorset; - cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) ); - cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) ); + cuda_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), cuda_res)); + cuda_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), cuda_res)); std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + std::memset(areav, 0, Nvert * Nvert * sizeof(double)); -// _raja_vertexarea_cuda_start - using EXEC_POL2 = RAJA::ExecPolicy>; + // _raja_vertexarea_cuda_start + using EXEC_POL2 = + RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(e2v_map[4*ie]); - areav[ iv[0] ] += areae[ie] / 4.0 ; - areav[ iv[1] ] += areae[ie] / 4.0 ; - areav[ iv[2] ] += areae[ie] / 4.0 ; - areav[ iv[3] ] += areae[ie] / 4.0 ; + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(e2v_map[4 * ie]); + areav[iv[0]] += areae[ie] / 4.0; + areav[iv[1]] += areae[ie] / 4.0; + areav[iv[2]] += areae[ie] / 4.0; + areav[iv[3]] += areae[ie] / 4.0; }); -// _raja_vertexarea_cuda_end + // _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); #endif -//----------------------------------------------------------------------------// -// RAJA HIP vertex sum calculation using IndexSet (sequential iteration -// over segments, HIP kernel launched for each segment) -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // RAJA HIP vertex sum calculation using IndexSet (sequential iteration + // over segments, HIP kernel launched for each segment) + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) -// -// Allocate and initialize device memory arrays -// + // + // Allocate and initialize device memory arrays + // double* d_areae = memoryManager::allocate_gpu(Nelem_tot); double* d_areav = memoryManager::allocate_gpu(Nvert_tot); - int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4 * Nelem_tot); - hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(d_areae, areae, Nelem_tot * sizeof(double), hipMemcpyHostToDevice); + hipMemcpy( + d_e2v_map, e2v_map, 4 * Nelem_tot * sizeof(int), hipMemcpyHostToDevice); std::memset(areav, 0, Nvert_tot * sizeof(double)); - hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_areav, areav, Nvert_tot * sizeof(double), hipMemcpyHostToDevice); -// -// Resource object used to construct list segment objects with indices -// living in device (GPU) memory. 
-// + // + // Resource object used to construct list segment objects with indices + // living in device (GPU) memory. + // camp::resources::Resource hip_res{camp::resources::Hip()}; -// -// Create a RAJA IndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA -// variants of the vertex sum calculation. + // + // Create a RAJA IndexSet with four ListSegments, one for the indices of + // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA + // variants of the vertex sum calculation. RAJA::TypedIndexSet hip_colorset; - hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); - hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + hip_colorset.push_back(SegmentType(&idx[0][0], idx[0].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[1][0], idx[1].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[2][0], idx[2].size(), hip_res)); + hip_colorset.push_back(SegmentType(&idx[3][0], idx[3].size(), hip_res)); std::cout << "\n Running RAJA HIP index set vertex sum...\n"; -// _raja_vertexarea_hip_start - using EXEC_POL3 = RAJA::ExecPolicy>; + // _raja_vertexarea_hip_start + using EXEC_POL3 = + RAJA::ExecPolicy>; - RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_e2v_map[4*ie]); - d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; - d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + RAJA::forall(hip_colorset, [=] RAJA_DEVICE(int ie) { + int* iv = &(d_e2v_map[4 * ie]); + d_areav[iv[0]] += d_areae[ie] / 4.0; + d_areav[iv[1]] += d_areae[ie] / 4.0; + d_areav[iv[2]] += d_areae[ie] / 4.0; + d_areav[iv[3]] += d_areae[ie] / 4.0; }); -// _raja_vertexarea_hip_end + // _raja_vertexarea_hip_end - hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + hipMemcpy(areav, d_areav, Nvert_tot * sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(areav, Nvert, jvoff); + // std::cout << "\n Vertex volumes...\n"; + // printMeshData(areav, Nvert, jvoff); memoryManager::deallocate_gpu(d_areae); memoryManager::deallocate_gpu(d_areav); @@ -398,7 +414,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// // Clean up... 
memoryManager::deallocate(areae); @@ -417,12 +433,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) void checkResult(double* a, double* aref, int n) { bool correct = true; - for (int i = 0; i < n*n; i++) { - if ( correct && std::abs(a[i] - aref[i]) > 10e-12 ) { correct = false; } + for (int i = 0; i < n * n; i++) + { + if (correct && std::abs(a[i] - aref[i]) > 10e-12) + { + correct = false; + } } - if ( correct ) { + if (correct) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } } @@ -433,11 +456,12 @@ void checkResult(double* a, double* aref, int n) void printMeshData(double* v, int n, int joff) { std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; + for (int j = 0; j < n; ++j) + { + for (int i = 0; i < n; ++i) + { + int ii = i + j * joff; + std::cout << "v(" << i << "," << j << ") = " << v[ii] << std::endl; } } std::cout << std::endl; diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp index 0f9383e95e..c743b84d28 100644 --- a/exercises/view-layout.cpp +++ b/exercises/view-layout.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,84 +58,92 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication 
reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays @@ -144,9 +152,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int Nx = 3; constexpr int Ny = 5; constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,49 +162,52 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t 
Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; @@ -205,47 +216,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// TODO... /// - /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout that iterates over the /// data array 'a' with unit stride. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view2D_start - std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + std::array defperm2{{0, 1}}; + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; @@ -258,35 +271,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// three-dimensional RAJA::Layout with the identity permutation. 
/// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + //----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perm_2D_start - std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + std::array perm2{{1, 0}}; + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; @@ -297,7 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// /// EXERCISE: Implement a triple loop nest using a RAJA::View and /// three-dimensional RAJA::Layout with the permutation - /// {2, 1, 0}. + /// {2, 1, 0}. /// /// Name the Layout object 'perm3a_layout' so it can be used /// with the index conversion methods in the section below. @@ -305,25 +320,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// Layout object you create here. /// - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _permb_view3D_start - std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + std::array perm3b{{1, 2, 0}}; + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -331,29 +349,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. We show these below using the -// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -361,44 +379,44 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\nperm3a_layout...\n" << std::endl; int lin = -1; - int i = -1; - int j = -1; - int k = -1; + int i = -1; + int j = -1; + int k = -1; -/* - // _perm3d_layout_start - lin = perm3a_layout(1, 2, 0); - std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; - std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + /* + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(7, i, j, k); - std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - // _perm3d_layout_end + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end - lin = perm3a_layout(2, 3, 1); - std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; - std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(26, i, j, k); - std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; - lin = perm3a_layout(0, 2, 1); - std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; - std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " - << "(since perm is {2, 1, 0})" << std::endl; + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; - perm3a_layout.toIndices(21, i, j, k); - std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; -*/ + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + */ 
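The commented-out block above exercises the two directions of a RAJA::Layout index mapping: operator() maps (i, j, k) to a linear offset and toIndices() inverts it. A self-contained sketch of the same round trip for the {1, 2, 0} permutation used in the next section, independent of the patch; the std::array element type RAJA::idx_t is assumed, matching typical RAJA example code:

  #include <array>
  #include <iostream>
  #include "RAJA/RAJA.hpp"

  int main()
  {
    constexpr int Nx = 3;
    constexpr int Ny = 5;
    constexpr int Nz = 2;

    // Permutation {1, 2, 0}: j strides slowest, k next, i has unit stride,
    // so lin = i + k * Nx + j * Nx * Nz.
    std::array<RAJA::idx_t, 3> perm{{1, 2, 0}};
    RAJA::Layout<3, int> layout =
        RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm);

    int lin = layout(1, 2, 0);  // 1 + 0 * Nx + 2 * Nx * Nz = 13
    int i, j, k;
    layout.toIndices(lin, i, j, k);  // recovers (1, 2, 0)

    std::cout << "lin = " << lin << ", (i, j, k) = (" << i << ", " << j
              << ", " << k << ")" << std::endl;
    return 0;
  }

The printed value 13 agrees with the perm3b_layout example below, where toIndices(13, i, j, k) recovers (1, 2, 0).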
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -409,7 +427,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -419,7 +438,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -428,7 +448,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) << "(since perm is {1, 2, 0})" << std::endl; perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; /// /// TODO... @@ -438,11 +459,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// data array 'a' with unit stride. /// -//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -450,10 +471,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -467,33 +488,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start - RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>({{imin}}, {{imax}}); - RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, - offlayout_1D); + RAJA::View> aoview_1Doff(ao, offlayout_1D); - for (int i = imin; i < imax; ++i) { + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); -//----------------------------------------// + 
//----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -510,17 +532,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); @@ -532,10 +556,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// same operations as the C-style example above. /// - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -547,50 +571,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start - std::array perm1D {{1, 0}}; - RAJA::OffsetLayout<2> permofflayout_2D = - RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, - {{imax, jmax}}, - perm1D ); + std::array perm1D{{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( + {{imin, jmin}}, {{imax, jmax}}, perm1D); - RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, - permofflayout_2D); + RAJA::View> aoview_2Dpermoff(ao, permofflayout_2D); iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -604,14 +629,19 @@ template void checkResult(T* C, T* Cref, int N) { bool match = true; - for (int i = 0; i < N; ++i) { - if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + for (int i = 0; i < N; ++i) + { + if (std::abs(C[i] - Cref[i]) > 10e-12) + { match = false; } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -619,7 +649,8 @@ void checkResult(T* C, T* Cref, int N) template void printValues(T* C, int N) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { std::cout << "array[" << i << "] = " << C[i] << std::endl; - } + } }; diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp index 7614c993a8..cb87b84aa4 100644 --- a/exercises/view-layout_solution.cpp +++ b/exercises/view-layout_solution.cpp @@ -22,9 +22,9 @@ * RAJA features shown: * - RAJA::View * - RAJA::Layout - * - Layout permutations + * - Layout permutations * - OffsetLayout - * - OffsetLayout permutations + * - OffsetLayout permutations * * NOTE: no RAJA kernel execution methods are used in these examples. */ @@ -38,16 +38,16 @@ void checkResult(T* C, T* Cref, int N); template void printValues(T* C, int N); -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA view & layout exercises...\n"; -//----------------------------------------------------------------------------// -// -// Matrix-matrix multiplication: default layout -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Matrix-matrix multiplication: default layout + // + //----------------------------------------------------------------------------// // _matmult_init_start // @@ -58,84 +58,92 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Allocate storage for matrices and initialize matrix entries // - double *A = new double[ N * N ]; - double *B = new double[ N * N ]; - double *C = new double[ N * N ]; - double *Cref = new double[ N * N ]; - - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - A[ col + N*row ] = row + 1; - B[ col + N*row ] = col + 1; - C[ col + N*row ] = 0.0; - Cref[ col + N*row ] = 0.0; + double* A = new double[N * N]; + double* B = new double[N * N]; + double* C = new double[N * N]; + double* Cref = new double[N * N]; + + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + A[col + N * row] = row + 1; + B[col + N * row] = col + 1; + C[col + N * row] = 0.0; + Cref[col + N * row] = 0.0; } } // _matmult_init_end -//printValues(A, N*N); -//printValues(B, N*N); -//printValues(C, N*N); -//printValues(Cref, N*N); + // printValues(A, N*N); + // printValues(B, N*N); + // printValues(C, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix 
multiplication reference solution...\n"; // _cstyle_matmult_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { - Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { + Cref[col + N * row] += A[k + N * row] * B[col + N * k]; } } } // _cstyle_matmult_end -//printValues(Cref, N*N); + // printValues(Cref, N*N); -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n Running matrix multiplication w/Views...\n"; - // + // // Define RAJA View objects to simplify access to the matrix entries. - // - // Note: we use default Layout + // + // Note: we use default Layout // // _matmult_views_start - RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); - RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + RAJA::View> Aview(A, N, N); + RAJA::View> Bview(B, N, N); + RAJA::View> Cview(C, N, N); // _matmult_views_end // _cstyle_matmult_views_start - for (int row = 0; row < N; ++row) { - for (int col = 0; col < N; ++col) { - for (int k = 0; k < N; ++k) { + for (int row = 0; row < N; ++row) + { + for (int col = 0; col < N; ++col) + { + for (int k = 0; k < N; ++k) + { Cview(row, col) += Aview(row, k) * Bview(k, col); } } } // _cstyle_matmult_views_end - checkResult(C, Cref, N*N); -//printValues(C, N*N); + checkResult(C, Cref, N * N); + // printValues(C, N*N); -// -// Clean up. -// - delete [] A; - delete [] B; - delete [] C; - delete [] Cref; + // + // Clean up. + // + delete[] A; + delete[] B; + delete[] C; + delete[] Cref; -//----------------------------------------------------------------------------// -// -// Default layouts use row-major data ordering -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Default layouts use row-major data ordering + // + //----------------------------------------------------------------------------// // // Define dimensions and allocate arrays @@ -144,9 +152,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int Nx = 3; constexpr int Ny = 5; constexpr int Nz = 2; - constexpr int Ntot = Nx*Ny*Nz; - int* a = new int[ Ntot ]; - int* aref = new int[ Ntot ]; + constexpr int Ntot = Nx * Ny * Nz; + int* a = new int[Ntot]; + int* aref = new int[Ntot]; for (int i = 0; i < Ntot; ++i) { @@ -154,61 +162,67 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_views_init_end -//printValues(ref, Ntot); + // printValues(ref, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout << "\n Running default layout view cases...\n"; std::cout << "\n\t Running 1D view case...\n"; - + std::memset(a, 0, Ntot * sizeof(int)); - - // _default_view1D_start - RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); - for (int i = 0; i < Ntot; ++i) { + // _default_view1D_start + RAJA::View> view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) + { view_1D(i) = i; } - // _default_view1D_end + // _default_view1D_end checkResult(a, aref, Ntot); -//printValues(a, Ntot); + // printValues(a, Ntot); -//----------------------------------------// + //----------------------------------------// std::cout 
<< "\n\t Running 2D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - + // _default_view2D_start - RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + RAJA::View> view_2D(a, Nx, Ny); int iter{0}; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { view_2D(i, j) = iter; ++iter; } } // _default_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); - // _default_view3D_start - RAJA::View< int, RAJA::Layout<3, int> > view_3D(a, Nx, Ny, Nz); + // _default_view3D_start + RAJA::View> view_3D(a, Nx, Ny, Nz); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { view_3D(i, j, k) = iter; ++iter; } @@ -216,57 +230,62 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------------------------------------------// -// -// Permuted layouts change the data striding order -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Permuted layouts change the data striding order + // + //----------------------------------------------------------------------------// std::cout << "\n Running permuted layout cases...\n"; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view2D_start - std::array defperm2 {{0, 1}}; - RAJA::Layout< 2, int > defperm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); - RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + std::array defperm2{{0, 1}}; + RAJA::Layout<2, int> defperm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, defperm2); + RAJA::View> defperm_view_2D(a, defperm2_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { defperm_view_2D(i, j) = iter; ++iter; } } // _default_perm_view2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D default permutation view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _default_perm_view3D_start - std::array defperm3 {{0, 1, 2}}; - RAJA::Layout< 3, int > defperm3_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3); - RAJA::View< int, RAJA::Layout<3, int> > defperm_view_3D(a, defperm3_layout); + std::array defperm3{{0, 1, 2}}; + RAJA::Layout<3, int> defperm3_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, defperm3); + RAJA::View> defperm_view_3D(a, defperm3_layout); iter = 0; - for (int i = 0; i < Nx; ++i) { - for (int j = 0; j < Ny; ++j) { - for (int k 
= 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) + { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { defperm_view_3D(i, j, k) = iter; ++iter; } @@ -274,50 +293,55 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _default_perm_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// -//----------------------------------------// + //----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perm_2D_start - std::array perm2 {{1, 0}}; - RAJA::Layout< 2, int > perm2_layout = - RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); - RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + std::array perm2{{1, 0}}; + RAJA::Layout<2, int> perm2_layout = + RAJA::make_permuted_layout({{Nx, Ny}}, perm2); + RAJA::View> perm_view_2D(a, perm2_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm_view_2D(i, j) = iter; ++iter; } } // _perm_2D_end - checkResult(a, aref, Nx*Ny); -//printValues(a, Nx*Ny); + checkResult(a, aref, Nx * Ny); + // printValues(a, Nx*Ny); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D perma layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _perma_view3D_start - std::array perm3a {{2, 1, 0}}; - RAJA::Layout< 3, int > perm3a_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3a); - RAJA::View< int, RAJA::Layout<3, int> > perm3a_view_3D(a, perm3a_layout); + std::array perm3a{{2, 1, 0}}; + RAJA::Layout<3, int> perm3a_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3a); + RAJA::View> perm3a_view_3D(a, perm3a_layout); iter = 0; - for (int k = 0; k < Nz; ++k) { - for (int j = 0; j < Ny; ++j) { - for (int i = 0; i < Nx; ++i) { + for (int k = 0; k < Nz; ++k) + { + for (int j = 0; j < Ny; ++j) + { + for (int i = 0; i < Nx; ++i) + { perm3a_view_3D(i, j, k) = iter; ++iter; } @@ -325,25 +349,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _perma_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); + // printValues(a, Nx*Ny*Nz); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 3D permb layout view case...\n"; std::memset(a, 0, Ntot * sizeof(int)); // _permb_view3D_start - std::array perm3b {{1, 2, 0}}; - RAJA::Layout< 3, int > perm3b_layout = - RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); - RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + std::array perm3b{{1, 2, 0}}; + RAJA::Layout<3, int> perm3b_layout = + RAJA::make_permuted_layout({{Nx, Ny, Nz}}, perm3b); + RAJA::View> perm3b_view_3D(a, perm3b_layout); iter = 0; - for (int j = 0; j < Ny; ++j) { - for (int k = 0; k < Nz; ++k) { - for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) + { + for (int k = 0; k < Nz; ++k) + { + for (int i = 0; i < Nx; ++i) + { perm3b_view_3D(i, j, k) = iter; ++iter; } @@ -351,29 +378,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } // _permb_view3D_end - checkResult(a, aref, Nx*Ny*Nz); -//printValues(a, Nx*Ny*Nz); + checkResult(a, aref, Nx * Ny * Nz); 
+ // printValues(a, Nx*Ny*Nz); -// -// Clean up. -// - delete [] a; - delete [] aref; + // + // Clean up. + // + delete[] a; + delete[] aref; -//----------------------------------------------------------------------------// -// -// Layouts: multi-dimensional indices vs. linear indicies -// -// RAJA::Layout type has methods that can be used to convert between -// multi-dimensional and linear indices. We show these below using the -// three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz -// sizes defined earlier: -// -// constexpr int Nx = 3; -// constexpr int Ny = 5; -// constexpr int Nz = 2; -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Layouts: multi-dimensional indices vs. linear indicies + // + // RAJA::Layout type has methods that can be used to convert between + // multi-dimensional and linear indices. We show these below using the + // three-dimensional layouts in the examples above. Recall the Nx, Ny, Nz + // sizes defined earlier: + // + // constexpr int Nx = 3; + // constexpr int Ny = 5; + // constexpr int Nz = 2; + // + //----------------------------------------------------------------------------// std::cout << "\n Multi-dimensional indices to linear indices...\n"; @@ -393,7 +420,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(7, i, j, k); std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; // _perm3d_layout_end @@ -404,7 +432,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(26, i, j, k); std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3a_layout(0, 2, 1); @@ -414,9 +443,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3a_layout.toIndices(21, i, j, k); std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\nperm3b_layout...\n" << std::endl; @@ -427,7 +457,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(13, i, j, k); std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(2, 3, 1); @@ -437,7 +468,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(23, i, j, k); std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; lin = perm3b_layout(0, 2, 1); @@ -447,13 +479,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) perm3b_layout.toIndices(15, i, j, k); std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " - << "(" << i << ", " << j << ", " << k << ")\n" << 
std::endl; + << "(" << i << ", " << j << ", " << k << ")\n" + << std::endl; -//----------------------------------------------------------------------------// -// -// Offset layouts apply offsets to indices -// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + // + // Offset layouts apply offsets to indices + // + //----------------------------------------------------------------------------// std::cout << "\n Running offset layout cases...\n"; @@ -461,10 +494,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Define some dimensions, and allocate arrays // constexpr int Ntot_ao = 40; - int* ao = new int[ Ntot_ao ]; - int* ao_ref = new int[ Ntot_ao ]; + int* ao = new int[Ntot_ao]; + int* ao_ref = new int[Ntot_ao]; -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 1D offset layout case...\n"; @@ -478,33 +511,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int imin = -5; int imax = 6; - for (int i = imin; i < imax; ++i) { - ao_ref[ i-imin ] = i; + for (int i = imin; i < imax; ++i) + { + ao_ref[i - imin] = i; } // _cstyle_offlayout1D_end -//printValues(ao_ref, imax-imin); + // printValues(ao_ref, imax-imin); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout1D_start - RAJA::OffsetLayout<1, int> offlayout_1D = - RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>({{imin}}, {{imax}}); - RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, - offlayout_1D); + RAJA::View> aoview_1Doff(ao, offlayout_1D); - for (int i = imin; i < imax; ++i) { + for (int i = imin; i < imax; ++i) + { aoview_1Doff(i) = i; } // _raja_offlayout1D_end - checkResult(ao, ao_ref, imax-imin); -//printValues(ao, 11); + checkResult(ao, ao_ref, imax - imin); + // printValues(ao, 11); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D offset layout case...\n"; @@ -521,39 +555,42 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int jmax = 5; iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { - ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { + ao_ref[(j - jmin) + (i - imin) * (jmax - jmin)] = iter; iter++; } } // _cstyle_offlayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_offlayout2D_start RAJA::OffsetLayout<2, int> offlayout_2D = - RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); + RAJA::make_offset_layout<2, int>({{imin, jmin}}, {{imax, jmax}}); - RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, - offlayout_2D); + RAJA::View> aoview_2Doff(ao, offlayout_2D); iter = 0; - for (int i = imin; i < imax; ++i) { - for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) + { + for (int j = jmin; j < jmax; ++j) + { aoview_2Doff(i, j) = iter; iter++; } } // _raja_offlayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, 
(imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::cout << "\n\t Running 2D permuted offset layout case...\n"; @@ -565,50 +602,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // _cstyle_permofflayout2D_start iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { - ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { + ao_ref[(i - imin) + (j - jmin) * (imax - imin)] = iter; iter++; } } // _cstyle_permofflayout2D_end -//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + // printValues(ao_ref, (imax-imin)*(jmax-jmin)); -//----------------------------------------// + //----------------------------------------// std::memset(ao, 0, Ntot_ao * sizeof(int)); // _raja_permofflayout2D_start - std::array perm1D {{1, 0}}; - RAJA::OffsetLayout<2> permofflayout_2D = - RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, - {{imax, jmax}}, - perm1D ); + std::array perm1D{{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = RAJA::make_permuted_offset_layout<2>( + {{imin, jmin}}, {{imax, jmax}}, perm1D); - RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, - permofflayout_2D); + RAJA::View> aoview_2Dpermoff(ao, permofflayout_2D); iter = 0; - for (int j = jmin; j < jmax; ++j) { - for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) + { + for (int i = imin; i < imax; ++i) + { aoview_2Dpermoff(i, j) = iter; iter++; } } // _raja_permofflayout2D_end - checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); -//printValues(ao, (imax-imin)*(jmax-jmin)); + checkResult(ao, ao_ref, (imax - imin) * (jmax - jmin)); + // printValues(ao, (imax-imin)*(jmax-jmin)); -// -// Clean up. -// - delete [] ao; - delete [] ao_ref; + // + // Clean up. 
+ // + delete[] ao; + delete[] ao_ref; -//----------------------------------------------------------------------------// -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -622,14 +660,19 @@ template void checkResult(T* C, T* Cref, int N) { bool match = true; - for (int i = 0; i < N; ++i) { - if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + for (int i = 0; i < N; ++i) + { + if (std::abs(C[i] - Cref[i]) > 10e-12) + { match = false; } } - if ( match ) { + if (match) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL\n"; } }; @@ -637,7 +680,8 @@ void checkResult(T* C, T* Cref, int N) template void printValues(T* C, int N) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) + { std::cout << "array[" << i << "] = " << C[i] << std::endl; - } + } }; diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 59cca4bf22..062be6c1bb 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -88,7 +88,7 @@ #endif #if defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/desul.hpp" +#include "RAJA/policy/desul.hpp" #endif #include "RAJA/index/IndexSet.hpp" @@ -197,11 +197,13 @@ #include "RAJA/pattern/sort.hpp" -namespace RAJA { -namespace expt{} +namespace RAJA +{ +namespace expt +{} // // provide a RAJA::expt namespace for experimental work, but bring alias // // it into RAJA so it doesn't affect user code // using namespace expt; -} +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index 1a467c8341..45f6777d93 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -34,8 +34,16 @@ namespace RAJA { -enum PushEnd { PUSH_FRONT, PUSH_BACK }; -enum PushCopy { PUSH_COPY, PUSH_NOCOPY }; +enum PushEnd +{ + PUSH_FRONT, + PUSH_BACK +}; +enum PushCopy +{ + PUSH_COPY, + PUSH_NOCOPY +}; template class TypedIndexSet; @@ -55,13 +63,14 @@ namespace indexset template struct ExecPolicy : public RAJA::make_policy_pattern_t { + RAJA::Pattern::forall> +{ using seg_it = SEG_ITER_POLICY_T; using seg_exec = SEG_EXEC_POLICY_T; }; -} // end namespace indexset -} // end namespace policy +} // end namespace indexset +} // end namespace policy using policy::indexset::ExecPolicy; @@ -91,7 +100,7 @@ class TypedIndexSet : public TypedIndexSet //! Construct empty index set #if _MSC_VER < 1910 - // this one instance of constexpr does not work on VS2012 or VS2015 + // this one instance of constexpr does not work on VS2012 or VS2015 RAJA_INLINE TypedIndexSet() : PARENT() {} #else RAJA_INLINE constexpr TypedIndexSet() : PARENT() {} @@ -99,12 +108,12 @@ class TypedIndexSet : public TypedIndexSet //! Copy-constructor for index set RAJA_INLINE - TypedIndexSet(TypedIndexSet const &c) - : PARENT((PARENT const &)c) + TypedIndexSet(TypedIndexSet const& c) : PARENT((PARENT const&)c) { size_t num = c.data.size(); data.resize(num); - for (size_t i = 0; i < num; ++i) { + for (size_t i = 0; i < num; ++i) + { data[i] = c.data[i]; } // mark all as not owned by us @@ -112,9 +121,10 @@ class TypedIndexSet : public TypedIndexSet } //! 
Copy-assignment operator for index set - TypedIndexSet &operator=(const TypedIndexSet &rhs) + TypedIndexSet& operator=(const TypedIndexSet& rhs) { - if (&rhs != this) { + if (&rhs != this) + { TypedIndexSet copy(rhs); this->swap(copy); } @@ -125,19 +135,21 @@ class TypedIndexSet : public TypedIndexSet RAJA_INLINE ~TypedIndexSet() { size_t num_seg = data.size(); - for (size_t i = 0; i < num_seg; ++i) { + for (size_t i = 0; i < num_seg; ++i) + { // Only free segment of we allocated it - if (owner[i]) { + if (owner[i]) + { delete data[i]; } } } //! Swap function for copy-and-swap idiom. - void swap(TypedIndexSet &other) + void swap(TypedIndexSet& other) { // Swap parents data - PARENT::swap((PARENT &)other); + PARENT::swap((PARENT&)other); // Swap our data using std::swap; swap(data, other.data); @@ -150,18 +162,20 @@ class TypedIndexSet : public TypedIndexSet /// This is used to implement the == and != operators /// template - RAJA_INLINE bool compareSegmentById( - size_t segid, - const TypedIndexSet &other) const + RAJA_INLINE bool + compareSegmentById(size_t segid, + const TypedIndexSet& other) const { // drill down our types until we have the right type - if (getSegmentTypes()[segid] != T0_TypeId) { + if (getSegmentTypes()[segid] != T0_TypeId) + { // peel off T0 return PARENT::compareSegmentById(segid, other); } // Check that other's segid is of type T0 - if (!other.template checkSegmentType(segid)) { + if (!other.template checkSegmentType(segid)) + { return false; } @@ -174,7 +188,8 @@ class TypedIndexSet : public TypedIndexSet template RAJA_INLINE bool checkSegmentType(size_t segid) const { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { return std::is_same::value; } return PARENT::template checkSegmentType(segid); @@ -183,22 +198,24 @@ class TypedIndexSet : public TypedIndexSet //! get specified segment by ID template - RAJA_INLINE P0 &getSegment(size_t segid) + RAJA_INLINE P0& getSegment(size_t segid) { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { Index_type offset = getSegmentOffsets()[segid]; - return *reinterpret_cast(data[offset]); + return *reinterpret_cast(data[offset]); } return PARENT::template getSegment(segid); } //! 
get specified segment by ID template - RAJA_INLINE P0 const &getSegment(size_t segid) const + RAJA_INLINE P0 const& getSegment(size_t segid) const { - if (getSegmentTypes()[segid] == T0_TypeId) { + if (getSegmentTypes()[segid] == T0_TypeId) + { Index_type offset = getSegmentOffsets()[segid]; - return *reinterpret_cast(data[offset]); + return *reinterpret_cast(data[offset]); } return PARENT::template getSegment(segid); } @@ -231,20 +248,25 @@ class TypedIndexSet : public TypedIndexSet private: template - RAJA_INLINE void push_into(TypedIndexSet &c, + RAJA_INLINE void push_into(TypedIndexSet& c, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { Index_type num = getNumSegments(); - if (pend == PUSH_BACK) { - for (Index_type i = 0; i < num; ++i) { + if (pend == PUSH_BACK) + { + for (Index_type i = 0; i < num; ++i) + { segment_push_into(i, c, pend, pcopy); - } - } else { - for (Index_type i = num-1; i > -1; --i) { + } + } + else + { + for (Index_type i = num - 1; i > -1; --i) + { segment_push_into(i, c, pend, pcopy); - } + } } } @@ -257,58 +279,64 @@ class TypedIndexSet : public TypedIndexSet public: template RAJA_INLINE void segment_push_into(size_t segid, - TypedIndexSet &c, + TypedIndexSet& c, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { - if (getSegmentTypes()[segid] != T0_TypeId) { + if (getSegmentTypes()[segid] != T0_TypeId) + { PARENT::segment_push_into(segid, c, pend, pcopy); return; } Index_type offset = getSegmentOffsets()[segid]; - switch (value_for(pend, pcopy)) { - case value_for(PUSH_BACK, PUSH_COPY): - c.push_back(*data[offset]); - break; - case value_for(PUSH_BACK, PUSH_NOCOPY): - c.push_back_nocopy(data[offset]); - break; - case value_for(PUSH_FRONT, PUSH_COPY): - c.push_front(*data[offset]); - break; - case value_for(PUSH_FRONT, PUSH_NOCOPY): - c.push_front_nocopy(data[offset]); - break; + switch (value_for(pend, pcopy)) + { + case value_for(PUSH_BACK, PUSH_COPY): + c.push_back(*data[offset]); + break; + case value_for(PUSH_BACK, PUSH_NOCOPY): + c.push_back_nocopy(data[offset]); + break; + case value_for(PUSH_FRONT, PUSH_COPY): + c.push_front(*data[offset]); + break; + case value_for(PUSH_FRONT, PUSH_NOCOPY): + c.push_front_nocopy(data[offset]); + break; } } //! Add segment to back end of index set without making a copy. template - RAJA_INLINE void push_back_nocopy(Tnew *val) + RAJA_INLINE void push_back_nocopy(Tnew* val) { push_internal(val, PUSH_BACK, PUSH_NOCOPY); } //! Add segment to front end of index set without making a copy. template - RAJA_INLINE void push_front_nocopy(Tnew *val) + RAJA_INLINE void push_front_nocopy(Tnew* val) { push_internal(val, PUSH_FRONT, PUSH_NOCOPY); } //! Add copy of segment to back end of index set. template - RAJA_INLINE void push_back(Tnew &&val) + RAJA_INLINE void push_back(Tnew&& val) { - push_internal(new typename std::decay::type(std::forward(val)), PUSH_BACK, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), + PUSH_BACK, + PUSH_COPY); } //! Add copy of segment to front end of index set. template - RAJA_INLINE void push_front(Tnew &&val) + RAJA_INLINE void push_front(Tnew&& val) { - push_internal(new typename std::decay::type(std::forward(val)), PUSH_FRONT, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), + PUSH_FRONT, + PUSH_COPY); } //! 
Return total length -- sum of lengths of all segments @@ -316,7 +344,8 @@ class TypedIndexSet : public TypedIndexSet { size_t total = PARENT::getLength(); size_t num = data.size(); - for (size_t i = 0; i < num; ++i) { + for (size_t i = 0; i < num; ++i) + { total += data[i]->size(); } return total; @@ -339,14 +368,13 @@ class TypedIndexSet : public TypedIndexSet /// RAJA_SUPPRESS_HD_WARN template - RAJA_HOST_DEVICE void segmentCall(size_t segid, - BODY &&body, - ARGS &&... args) const - { - if (getSegmentTypes()[segid] != T0_TypeId) { - PARENT::segmentCall(segid, - std::forward(body), - std::forward(args)...); + RAJA_HOST_DEVICE void + segmentCall(size_t segid, BODY&& body, ARGS&&... args) const + { + if (getSegmentTypes()[segid] != T0_TypeId) + { + PARENT::segmentCall( + segid, std::forward(body), std::forward(args)...); return; } Index_type offset = getSegmentOffsets()[segid]; @@ -356,24 +384,23 @@ class TypedIndexSet : public TypedIndexSet protected: //! Internal logic to add a new segment -- catch invalid type insertion template - RAJA_INLINE void push_internal(Tnew *val, - PushEnd pend = PUSH_BACK, - PushCopy pcopy = PUSH_COPY) + RAJA_INLINE void + push_internal(Tnew* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { static_assert(sizeof...(TREST) > 0, "Invalid type for this TypedIndexSet"); PARENT::push_internal(val, pend, pcopy); } //! Internal logic to add a new segment - RAJA_INLINE void push_internal(T0 *val, - PushEnd pend = PUSH_BACK, - PushCopy pcopy = PUSH_COPY) + RAJA_INLINE void + push_internal(T0* val, PushEnd pend = PUSH_BACK, PushCopy pcopy = PUSH_COPY) { data.push_back(val); owner.push_back(pcopy == PUSH_COPY); // Determine if we push at the front or back of the segment list - if (pend == PUSH_BACK) { + if (pend == PUSH_BACK) + { // Store the segment type getSegmentTypes().push_back(T0_TypeId); @@ -384,7 +411,9 @@ class TypedIndexSet : public TypedIndexSet size_t icount = val->size(); getSegmentIcounts().push_back(getTotalLength()); increaseTotalLength(icount); - } else { + } + else + { // Store the segment type getSegmentTypes().push_front(T0_TypeId); @@ -394,7 +423,8 @@ class TypedIndexSet : public TypedIndexSet // Store the segment icount getSegmentIcounts().push_front(0); size_t icount = val->size(); - for (size_t i = 1; i < getSegmentIcounts().size(); ++i) { + for (size_t i = 1; i < getSegmentIcounts().size(); ++i) + { getSegmentIcounts()[i] += icount; } increaseTotalLength(icount); @@ -402,7 +432,7 @@ class TypedIndexSet : public TypedIndexSet } //! Returns the number of indices (the total icount of segments - RAJA_INLINE Index_type &getTotalLength() { return PARENT::getTotalLength(); } + RAJA_INLINE Index_type& getTotalLength() { return PARENT::getTotalLength(); } //! set total length of the indexset RAJA_INLINE void setTotalLength(int n) { return PARENT::setTotalLength(n); } @@ -439,7 +469,8 @@ class TypedIndexSet : public TypedIndexSet int minSeg = RAJA::operators::maximum{}(0, begin); int maxSeg = RAJA::operators::minimum{}(end, getNumSegments()); - for (int i = minSeg; i < maxSeg; ++i) { + for (int i = minSeg; i < maxSeg; ++i) + { segment_push_into(i, retVal, PUSH_BACK, PUSH_NOCOPY); } return retVal; @@ -452,13 +483,15 @@ class TypedIndexSet : public TypedIndexSet /// This TypedIndexSet will not change and the created "slice" into it /// will not own any of its segments. 
/// - TypedIndexSet createSlice(const int *segIds, int len) + TypedIndexSet createSlice(const int* segIds, int len) { TypedIndexSet retVal; int numSeg = getNumSegments(); - for (int i = 0; i < len; ++i) { - if (segIds[i] >= 0 && segIds[i] < numSeg) { + for (int i = 0; i < len; ++i) + { + if (segIds[i] >= 0 && segIds[i] < numSeg) + { segment_push_into(segIds[i], retVal, PUSH_BACK, PUSH_NOCOPY); } } @@ -476,12 +509,14 @@ class TypedIndexSet : public TypedIndexSet /// iterator type must de-reference to an integral value. /// template - TypedIndexSet createSlice(const T &segIds) + TypedIndexSet createSlice(const T& segIds) { TypedIndexSet retVal; int numSeg = getNumSegments(); - for (auto &seg : segIds) { - if (seg >= 0 && seg < numSeg) { + for (auto& seg : segIds) + { + if (seg >= 0 && seg < numSeg) + { segment_push_into(seg, retVal, PUSH_BACK, PUSH_NOCOPY); } } @@ -509,37 +544,37 @@ class TypedIndexSet : public TypedIndexSet protected: //! Returns the mapping of segment_index -> segment_type - RAJA_INLINE RAJA::RAJAVec &getSegmentTypes() + RAJA_INLINE RAJA::RAJAVec& getSegmentTypes() { return PARENT::getSegmentTypes(); } //! Returns the mapping of segment_index -> segment_type - RAJA_INLINE RAJA::RAJAVec const &getSegmentTypes() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentTypes() const { return PARENT::getSegmentTypes(); } //! Returns the mapping of segment_index -> segment_offset - RAJA_INLINE RAJA::RAJAVec &getSegmentOffsets() + RAJA_INLINE RAJA::RAJAVec& getSegmentOffsets() { return PARENT::getSegmentOffsets(); } //! Returns the mapping of segment_index -> segment_offset - RAJA_INLINE RAJA::RAJAVec const &getSegmentOffsets() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentOffsets() const { return PARENT::getSegmentOffsets(); } //! Returns the icount of segments - RAJA_INLINE RAJA::RAJAVec &getSegmentIcounts() + RAJA_INLINE RAJA::RAJAVec& getSegmentIcounts() { return PARENT::getSegmentIcounts(); } //! Returns the icount of segments - RAJA_INLINE RAJA::RAJAVec const &getSegmentIcounts() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentIcounts() const { return PARENT::getSegmentIcounts(); } @@ -552,13 +587,15 @@ class TypedIndexSet : public TypedIndexSet /// types and indices; e.g., dependency info not checked. /// template - RAJA_INLINE bool operator==(const TypedIndexSet &other) const + RAJA_INLINE bool operator==(const TypedIndexSet& other) const { size_t num_seg = getNumSegments(); if (num_seg != other.getNumSegments()) return false; - for (size_t segid = 0; segid < num_seg; ++segid) { - if (!compareSegmentById(segid, other)) { + for (size_t segid = 0; segid < num_seg; ++segid) + { + if (!compareSegmentById(segid, other)) + { return false; } } @@ -567,14 +604,14 @@ class TypedIndexSet : public TypedIndexSet //! Inequality operator returns true if any segment is not equal, else false. template - RAJA_INLINE bool operator!=(const TypedIndexSet &other) const + RAJA_INLINE bool operator!=(const TypedIndexSet& other) const { return (!(*this == other)); } private: //! vector of TypedIndexSet data objects of type T0 - RAJA::RAJAVec data; + RAJA::RAJAVec data; //! vector indicating which segments are owned by the TypedIndexSet RAJA::RAJAVec owner; @@ -603,7 +640,7 @@ class TypedIndexSet<> //! Copy-constructor. RAJA_INLINE - TypedIndexSet(TypedIndexSet const &c) + TypedIndexSet(TypedIndexSet const& c) { segment_types = c.segment_types; segment_offsets = c.segment_offsets; @@ -612,7 +649,7 @@ class TypedIndexSet<> } //! Swap function for copy-and-swap idiom (deep copy). 
- void swap(TypedIndexSet &other) + void swap(TypedIndexSet& other) { using std::swap; swap(segment_types, other.segment_types); @@ -625,7 +662,7 @@ class TypedIndexSet<> RAJA_INLINE static size_t getNumTypes() { return 0; } template - RAJA_INLINE constexpr bool isValidSegmentType(T const &) const + RAJA_INLINE constexpr bool isValidSegmentType(T const&) const { // Segment type wasn't found return false; @@ -637,40 +674,39 @@ class TypedIndexSet<> template RAJA_INLINE void segmentCall(size_t, BODY, ARGS...) const - { - } + {} - RAJA_INLINE RAJA::RAJAVec &getSegmentTypes() + RAJA_INLINE RAJA::RAJAVec& getSegmentTypes() { return segment_types; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentTypes() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentTypes() const { return segment_types; } - RAJA_INLINE RAJA::RAJAVec &getSegmentOffsets() + RAJA_INLINE RAJA::RAJAVec& getSegmentOffsets() { return segment_offsets; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentOffsets() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentOffsets() const { return segment_offsets; } - RAJA_INLINE RAJA::RAJAVec &getSegmentIcounts() + RAJA_INLINE RAJA::RAJAVec& getSegmentIcounts() { return segment_icounts; } - RAJA_INLINE RAJA::RAJAVec const &getSegmentIcounts() const + RAJA_INLINE RAJA::RAJAVec const& getSegmentIcounts() const { return segment_icounts; } - RAJA_INLINE Index_type &getTotalLength() { return m_len; } + RAJA_INLINE Index_type& getTotalLength() { return m_len; } RAJA_INLINE void setTotalLength(int n) { m_len = n; } @@ -678,7 +714,7 @@ class TypedIndexSet<> template RAJA_INLINE bool compareSegmentById(size_t, - const TypedIndexSet &) const + const TypedIndexSet&) const { return false; } @@ -690,34 +726,29 @@ class TypedIndexSet<> } template - RAJA_INLINE P0 &getSegment(size_t) + RAJA_INLINE P0& getSegment(size_t) { - return *((P0 *)(this - this)); + return *((P0*)(this - this)); } template - RAJA_INLINE P0 const &getSegment(size_t) const + RAJA_INLINE P0 const& getSegment(size_t) const { - return *((P0 *)(this - this)); + return *((P0*)(this - this)); } template - RAJA_INLINE void push_into(TypedIndexSet &, PushEnd, PushCopy) const - { - } + RAJA_INLINE void push_into(TypedIndexSet&, PushEnd, PushCopy) const + {} template - RAJA_INLINE void segment_push_into(size_t, - TypedIndexSet &, - PushEnd, - PushCopy) const - { - } + RAJA_INLINE void + segment_push_into(size_t, TypedIndexSet&, PushEnd, PushCopy) const + {} template - RAJA_INLINE void push(Tnew const &, PushEnd, PushCopy) - { - } + RAJA_INLINE void push(Tnew const&, PushEnd, PushCopy) + {} public: using iterator = Iterators::numeric_iterator; @@ -762,15 +793,17 @@ namespace type_traits template struct is_index_set - : ::RAJA::type_traits::SpecializationOf::type> { -}; + : ::RAJA::type_traits::SpecializationOf::type> +{}; template struct is_indexset_policy - : ::RAJA::type_traits::SpecializationOf::type> { -}; -} // namespace type_traits + : ::RAJA::type_traits::SpecializationOf::type> +{}; +} // namespace type_traits -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp index 543524be01..cd614cca01 100644 --- a/include/RAJA/index/IndexSetBuilders.hpp +++ b/include/RAJA/index/IndexSetBuilders.hpp @@ -37,13 +37,13 @@ namespace RAJA * \brief Generate an index set with aligned Range segments and List segments, * as needed, from given array of indices. 
* - * Routine does no error-checking on argements and assumes + * Routine does no error-checking on argements and assumes * RAJA::Index_type array contains valid indices. * - * \param iset reference to index set generated with aligned range segments + * \param iset reference to index set generated with aligned range segments * and list segments. Method assumes index set is empty (no segments). - * \param work_res camp resource object that identifies the memory space in - * which list segment index data will live (passed to list segment + * \param work_res camp resource object that identifies the memory space in + * which list segment index data will live (passed to list segment * ctor). * \param indices_in pointer to start of input array of indices. * \param length size of input index array. @@ -79,37 +79,36 @@ void RAJASHAREDDLL_API buildIndexSetAligned( ****************************************************************************** * * \brief Generate a lock-free "block" index set (planar division) containing - * range segments. + * range segments. * - * The method chunks a fastDim x midDim x slowDim mesh into blocks that + * The method chunks a fastDim x midDim x slowDim mesh into blocks that * can be dependency-scheduled, removing need for lock constructs. * * \param iset reference to index set generated with range segments. - * Method assumes index set is empty (no segments). + * Method assumes index set is empty (no segments). * \param fastDim "fast" block dimension (see above). * \param midDim "mid" block dimension (see above). * \param slowDim "slow" block dimension (see above). * ****************************************************************************** */ -void buildLockFreeBlockIndexset( - RAJA::TypedIndexSet& iset, - int fastDim, - int midDim, - int slowDim); +void buildLockFreeBlockIndexset(RAJA::TypedIndexSet& iset, + int fastDim, + int midDim, + int slowDim); /*! ****************************************************************************** * * \brief Generate a lock-free "color" index set containing range and list * segments. - * - * TThe domain-set is colored based on connectivity to the range-set. - * All elements in each segment are independent, and no two segments + * + * TThe domain-set is colored based on connectivity to the range-set. + * All elements in each segment are independent, and no two segments * can be executed in parallel. * - * \param iset reference to index set generated. Method assumes index set - * is empty (no segments). + * \param iset reference to index set generated. Method assumes index set + * is empty (no segments). * \param work_res camp resource object that identifies the memory space in * which list segment index data will live (passed to list segment * ctor). @@ -126,6 +125,6 @@ void buildLockFreeColorIndexset( RAJA::Index_type* elemPermutation = nullptr, RAJA::Index_type* ielemPermutation = nullptr); -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp index 4baea450fc..eefc0ebbc4 100644 --- a/include/RAJA/index/IndexSetUtils.hpp +++ b/include/RAJA/index/IndexSetUtils.hpp @@ -31,10 +31,10 @@ namespace RAJA //@{ //! @name Methods to gather indices of segment or index set into a container. //! -//! For each method, the given container must be templated on a data type, -//! have default and copy ctors, push_back method, and value_type. Is is -//! 
assumed that the container data type and segment or index set data type -//! are compatible in the sense that the index set type can be converted to +//! For each method, the given container must be templated on a data type, +//! have default and copy ctors, push_back method, and value_type. Is is +//! assumed that the container data type and segment or index set data type +//! are compatible in the sense that the index set type can be converted to //! the container data type. /*! @@ -49,11 +49,8 @@ RAJA_INLINE void getIndices(CONTAINER_T& con, const TypedIndexSet& iset) { CONTAINER_T tcon; - forall >(iset, - [&](typename CONTAINER_T::value_type idx) { - tcon.push_back(idx); - } - ); + forall>( + iset, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); }); con = tcon; } @@ -68,11 +65,8 @@ template RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg) { CONTAINER_T tcon; - forall(seg, - [&](typename CONTAINER_T::value_type idx) { - tcon.push_back(idx); - } - ); + forall( + seg, [&](typename CONTAINER_T::value_type idx) { tcon.push_back(idx); }); con = tcon; } @@ -90,11 +84,10 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con, CONDITIONAL conditional) { CONTAINER_T tcon; - forall >(iset, - [&](typename CONTAINER_T::value_type idx) { - if (conditional(idx)) tcon.push_back(idx); - } - ); + forall>( + iset, [&](typename CONTAINER_T::value_type idx) { + if (conditional(idx)) tcon.push_back(idx); + }); con = tcon; } @@ -112,16 +105,14 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con, CONDITIONAL conditional) { CONTAINER_T tcon; - forall(seg, - [&](typename CONTAINER_T::value_type idx) { - if (conditional(idx)) tcon.push_back(idx); - } - ); + forall(seg, [&](typename CONTAINER_T::value_type idx) { + if (conditional(idx)) tcon.push_back(idx); + }); con = tcon; } //@} -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp index 44fa143445..8579f2c856 100644 --- a/include/RAJA/index/IndexValue.hpp +++ b/include/RAJA/index/IndexValue.hpp @@ -28,8 +28,8 @@ namespace RAJA { -struct IndexValueBase { -}; +struct IndexValueBase +{}; /*! * \brief Strongly typed "integer" class. @@ -44,16 +44,17 @@ struct IndexValueBase { * Yes, this uses the curiously-recurring template pattern. */ template -struct IndexValue : public IndexValueBase { +struct IndexValue : public IndexValueBase +{ using value_type = VALUE; //! Default constructor initializes value to 0. RAJA_INLINE constexpr IndexValue() = default; - constexpr RAJA_INLINE IndexValue(IndexValue const &) = default; - constexpr RAJA_INLINE IndexValue(IndexValue &&) = default; - RAJA_INLINE IndexValue &operator=(IndexValue const &) = default; - RAJA_INLINE IndexValue &operator=(IndexValue &&) = default; + constexpr RAJA_INLINE IndexValue(IndexValue const&) = default; + constexpr RAJA_INLINE IndexValue(IndexValue&&) = default; + RAJA_INLINE IndexValue& operator=(IndexValue const&) = default; + RAJA_INLINE IndexValue& operator=(IndexValue&&) = default; /*! * \brief Explicit constructor. @@ -61,14 +62,13 @@ struct IndexValue : public IndexValueBase { */ RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit IndexValue(value_type v) : value(v) - { - } + {} //! Dereference provides cast-to-integer. - RAJA_HOST_DEVICE RAJA_INLINE value_type &operator*() { return value; } + RAJA_HOST_DEVICE RAJA_INLINE value_type& operator*() { return value; } //! 
Dereference provides cast-to-integer. - RAJA_HOST_DEVICE RAJA_INLINE const value_type &operator*() const + RAJA_HOST_DEVICE RAJA_INLINE const value_type& operator*() const { return value; } @@ -82,10 +82,10 @@ struct IndexValue : public IndexValueBase { } //! preincrement stored index - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator++() + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator++() { value++; - return static_cast(*this); + return static_cast(*this); } //! postdecrement -- returns a copy @@ -97,10 +97,10 @@ struct IndexValue : public IndexValueBase { } //! preincrement stored index - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator--() + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator--() { value--; - return static_cast(*this); + return static_cast(*this); } //! addition to underlying index from an Index_type @@ -163,52 +163,52 @@ struct IndexValue : public IndexValueBase { return TYPE(value % a.value); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(value_type x) { value += x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(TYPE x) { value += x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(value_type x) { value -= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(TYPE x) { value -= x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(value_type x) { value *= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(TYPE x) { value *= x.value; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(value_type x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(value_type x) { value /= x; - return static_cast(*this); + return static_cast(*this); } - RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(TYPE x) + RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(TYPE x) { value /= x.value; - return static_cast(*this); + return static_cast(*this); } RAJA_HOST_DEVICE RAJA_INLINE bool operator<(value_type x) const @@ -295,7 +295,7 @@ convertIndex_helper(typename FROM::IndexValueType const val) } -} // namespace internal +} // namespace internal /*! * \brief Function provides a way to take either an int or any Index<> type, and @@ -334,16 +334,20 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE return val; } -namespace internal{ -template -struct StripIndexTypeT { - using type = FROM; +namespace internal +{ +template +struct StripIndexTypeT +{ + using type = FROM; }; -template -struct StripIndexTypeT::value>::type> +template +struct StripIndexTypeT< + FROM, + typename std::enable_if::value>::type> { - using type = typename FROM::value_type; + using type = typename FROM::value_type; }; } // namespace internal @@ -353,7 +357,7 @@ struct StripIndexTypeT +template using strip_index_type_t = typename internal::StripIndexTypeT::type; /*! 
@@ -362,33 +366,31 @@ using strip_index_type_t = typename internal::StripIndexTypeT::type; * * \param FROM the original type */ -template -using make_signed_t = typename std::conditional < - std::is_floating_point::value, - std::common_type, - std::make_signed - >::type::type; +template +using make_signed_t = + typename std::conditional::value, + std::common_type, + std::make_signed>::type::type; -} // namespace RAJA +} // namespace RAJA /*! * \brief Helper Macro to create new Index types. * \param TYPE the name of the type * \param NAME a string literal to identify this index type */ -#define RAJA_INDEX_VALUE(TYPE, NAME) \ - class TYPE : public ::RAJA::IndexValue \ - { \ - using parent = ::RAJA::IndexValue; \ - \ - public: \ - using IndexValueType = TYPE; \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \ - : parent::IndexValue(v) \ - { \ - } \ - static inline std::string getName() { return NAME; } \ +#define RAJA_INDEX_VALUE(TYPE, NAME) \ + class TYPE : public ::RAJA::IndexValue \ + { \ + using parent = ::RAJA::IndexValue; \ + \ + public: \ + using IndexValueType = TYPE; \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \ + : parent::IndexValue(v) \ + {} \ + static inline std::string getName() { return NAME; } \ }; /*! @@ -397,17 +399,17 @@ using make_signed_t = typename std::conditional < * \param IDXT the index types value type * \param NAME a string literal to identify this index type */ -#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ - class TYPE : public ::RAJA::IndexValue \ - { \ - public: \ - RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ - : RAJA::IndexValue::IndexValue() {} \ - RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ - : RAJA::IndexValue::IndexValue(v) \ - { \ - } \ - static inline std::string getName() { return NAME; } \ +#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME) \ + class TYPE : public ::RAJA::IndexValue \ + { \ + public: \ + RAJA_HOST_DEVICE RAJA_INLINE TYPE() \ + : RAJA::IndexValue::IndexValue() \ + {} \ + RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v) \ + : RAJA::IndexValue::IndexValue(v) \ + {} \ + static inline std::string getName() { return NAME; } \ }; #endif diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index adee46053c..ec4da54a1d 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -85,7 +85,6 @@ template class TypedListSegment { public: - //@{ //! @name Types used in implementation based on template parameter. @@ -111,7 +110,7 @@ class TypedListSegment * \param values array of indices defining iteration space of segment * \param length number of indices * \param resource camp resource defining memory space where index data live - * \param owned optional enum value indicating whether segment owns indices + * \param owned optional enum value indicating whether segment owns indices * (Owned or Unowned). Default is Owned. 
* * If 'Unowned' is passed as last argument, the segment will not own its @@ -121,7 +120,7 @@ class TypedListSegment Index_type length, camp::resources::Resource resource, IndexOwnership owned = Owned) - : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) { initIndexData(values, length, resource, owned); } @@ -141,9 +140,13 @@ class TypedListSegment template TypedListSegment(const Container& container, camp::resources::Resource resource) - : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size()) + : m_resource(nullptr), + m_owned(Unowned), + m_data(nullptr), + m_size(container.size()) { - if (m_size > 0) { + if (m_size > 0) + { camp::resources::Resource host_res{camp::resources::Host()}; @@ -152,7 +155,8 @@ class TypedListSegment auto dest = tmp; auto src = container.begin(); auto const end = container.end(); - while (src != end) { + while (src != end) + { *dest = *src; ++dest; ++src; @@ -164,7 +168,6 @@ class TypedListSegment m_owned = Owned; host_res.deallocate(tmp); - } } @@ -175,10 +178,11 @@ class TypedListSegment // As this may be called from a lambda in a // RAJA method we perform a shallow copy RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other) - : m_resource(nullptr), - m_owned(Unowned), m_data(other.m_data), m_size(other.m_size) - { - } + : m_resource(nullptr), + m_owned(Unowned), + m_data(other.m_data), + m_size(other.m_size) + {} //! Copy assignment for list segment // As this may be called from a lambda in a @@ -192,7 +196,7 @@ class TypedListSegment m_size = other.m_size; } - //! move assignment for list segment + //! move assignment for list segment // As this may be called from a lambda in a // RAJA method we perform a shallow copy RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs) @@ -211,8 +215,10 @@ class TypedListSegment //! Move constructor for list segment RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs) - : m_resource(rhs.m_resource), - m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) + : m_resource(rhs.m_resource), + m_owned(rhs.m_owned), + m_data(rhs.m_data), + m_size(rhs.m_size) { rhs.m_owned = Unowned; rhs.m_resource = nullptr; @@ -221,17 +227,15 @@ class TypedListSegment } //! List segment destructor - RAJA_HOST_DEVICE ~TypedListSegment() - { - clear(); - } + RAJA_HOST_DEVICE ~TypedListSegment() { clear(); } //! 
Clear method to be called RAJA_HOST_DEVICE void clear() { #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) - if (m_data != nullptr && m_owned == Owned) { + if (m_data != nullptr && m_owned == Owned) + { m_resource->deallocate(m_data); delete m_resource; } @@ -345,7 +349,8 @@ class TypedListSegment { // empty list segment - if (len <= 0 || container == nullptr) { + if (len <= 0 || container == nullptr) + { m_data = nullptr; m_size = 0; m_owned = Unowned; @@ -355,22 +360,24 @@ class TypedListSegment // some non-zero size -- initialize accordingly m_size = len; m_owned = container_own; - if (m_owned == Owned) { + if (m_owned == Owned) + { - m_resource = new camp::resources::Resource(resource_); + m_resource = new camp::resources::Resource(resource_); - camp::resources::Resource host_res{camp::resources::Host()}; + camp::resources::Resource host_res{camp::resources::Host()}; - value_type* tmp = host_res.allocate(m_size); + value_type* tmp = host_res.allocate(m_size); - for (Index_type i = 0; i < m_size; ++i) { - tmp[i] = container[i]; - } + for (Index_type i = 0; i < m_size; ++i) + { + tmp[i] = container[i]; + } - m_data = m_resource->allocate(m_size); - m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); - host_res.deallocate(tmp); + host_res.deallocate(tmp); return; } @@ -382,7 +389,7 @@ class TypedListSegment // Copy of camp resource passed to ctor - camp::resources::Resource *m_resource; + camp::resources::Resource* m_resource; // Ownership flag to guide data copying/management IndexOwnership m_owned; @@ -397,7 +404,7 @@ class TypedListSegment //! Alias for A TypedListSegment using ListSegment = TypedListSegment; -} // namespace RAJA +} // namespace RAJA namespace std { @@ -409,6 +416,6 @@ RAJA_INLINE void swap(RAJA::TypedListSegment& a, { a.swap(b); } -} // namespace std +} // namespace std -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp index a41959c583..3ee1ba3653 100644 --- a/include/RAJA/index/RangeSegment.hpp +++ b/include/RAJA/index/RangeSegment.hpp @@ -50,10 +50,10 @@ namespace RAJA * * NOTE: TypedRangeSegment::iterator is a RandomAccessIterator * - * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of + * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of * indices [-5, 3). * - * NOTE: Proper handling of indices strides requires that StorageT is a + * NOTE: Proper handling of indices strides requires that StorageT is a * signed type. * * Usage: @@ -92,15 +92,22 @@ namespace RAJA * ****************************************************************************** */ -template >> -struct TypedRangeSegment { +template >> +struct TypedRangeSegment +{ - // + // // Static asserts to provide some useful error messages during compilation // for incorrect usage. - // - static_assert(std::is_signed::value, "TypedRangeSegment DiffT requires signed type."); - static_assert(!std::is_floating_point::value, "TypedRangeSegment Type must be non floating point."); + // + static_assert(std::is_signed::value, + "TypedRangeSegment DiffT " + "requires signed type."); + static_assert(!std::is_floating_point::value, + "TypedRangeSegment " + "Type must be non " + "floating point."); //@{ //! @name Types used in implementation based on template parameters. 
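
For readers checking the reformatted TypedRangeSegment hunks against the documented behavior, a minimal usage sketch may help (the seq_exec policy, the loop body, and the values are illustrative only and are not part of this patch):

    #include "RAJA/RAJA.hpp"

    void range_segment_sketch()
    {
      // Half-open interval [0, 10): inclusive of begin(), exclusive of end()
      RAJA::TypedRangeSegment<int> range(0, 10);

      int sum = 0;
      RAJA::forall<RAJA::seq_exec>(range, [&](int i) { sum += i; });  // sum == 45

      // slice(1, 5) keeps at most 5 iterates starting at the 2nd entry: [1, 6)
      auto sub = range.slice(1, 5);
      (void)sub;
    }
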
@@ -117,20 +124,19 @@ struct TypedRangeSegment { //@} //@{ - //! @name Constructors, destructor, and copy assignment. + //! @name Constructors, destructor, and copy assignment. /*! * \brief Construct a range segment repreenting the interval [begin, end) - * + * * \param begin start value (inclusive) for the range * \param end end value (exclusive) for the range */ using StripStorageT = strip_index_type_t; - RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end) - : m_begin(iterator(begin)), - m_end(begin > end ? m_begin : iterator(end)) - { - } + RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, + StripStorageT end) + : m_begin(iterator(begin)), m_end(begin > end ? m_begin : iterator(end)) + {} //! Disable compiler generated constructor RAJA_HOST_DEVICE TypedRangeSegment() = delete; @@ -187,7 +193,7 @@ struct TypedRangeSegment { * \brief Compare this segment to another for inequality * * \return true if begin or end does not match, else false - */ + */ RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const { return !(operator==(o)); @@ -198,9 +204,9 @@ struct TypedRangeSegment { /*! * \brief Get a new TypedRangeSegment instance representing a slice of * existing segment - * - * \param begin start iterate of new range - * \param length maximum length of new range + * + * \param begin start iterate of new range + * \param length maximum length of new range * \return TypedRangeSegment representing the interval * [ *begin() + begin, min( *begin() + begin + length, *end() ) ) * @@ -213,7 +219,7 @@ struct TypedRangeSegment { * auto r = RAJA::TypedRangeSegment(-4, 4); * * // s repreents the subinterval [-3, 2) - * auto s = r.slice(1, 5); + * auto s = r.slice(1, 5); * * \endverbatim */ @@ -247,8 +253,8 @@ struct TypedRangeSegment { /*! ****************************************************************************** * - * \class TypedRangeStrideSegment - * + * \class TypedRangeStrideSegment + * * \brief Segment class representing a strided range of typed indices * * \tparam StorageT underlying data type for the segment indices (required) @@ -264,9 +270,9 @@ struct TypedRangeSegment { * * NOTE: TypedRangeStrideSegment::iterator is a RandomAccessIterator * - * NOTE: TypedRangeStrideSegment allows for positive or negative strides and - * indices. This allows for forward (stride > 0) or backward (stride < 0) - * traversal of the iteration space. A stride of zero is undefined and + * NOTE: TypedRangeStrideSegment allows for positive or negative strides and + * indices. This allows for forward (stride > 0) or backward (stride < 0) + * traversal of the iteration space. A stride of zero is undefined and * will cause divide-by-zero errors. * * As with RangeSegment, the iteration space is inclusive of begin() and @@ -275,7 +281,7 @@ struct TypedRangeSegment { * For positive strides, begin() > end() implies size()==0 * For negative strides, begin() < end() implies size()==0 * - * NOTE: Proper handling of negative strides and indices requires that + * NOTE: Proper handling of negative strides and indices requires that * StorageT is a signed type. * * Usage: @@ -321,15 +327,23 @@ struct TypedRangeSegment { * ****************************************************************************** */ -template >> -struct TypedRangeStrideSegment { +template >> +struct TypedRangeStrideSegment +{ // // Static asserts to provide some useful error messages during compilation // for incorrect usage. 
// - static_assert(std::is_signed::value, "TypedRangeStrideSegment DiffT requires signed type."); - static_assert(!std::is_floating_point::value, "TypedRangeStrideSegment Type must be non floating point."); + static_assert(std::is_signed::value, + "TypedRangeStrideSegment DiffT " + "requires signed type."); + static_assert(!std::is_floating_point::value, + "TypedRangeStrideSegm" + "ent Type must be " + "non floating " + "point."); //@{ //! @name Types used in implementation based on template parameters. @@ -349,7 +363,7 @@ struct TypedRangeStrideSegment { //! @name Constructors, destructor, and copy assignment. /*! - * \brief Construct a range segment for the interval [begin, end) with + * \brief Construct a range segment for the interval [begin, end) with * given stride * * \param begin start value (inclusive) for the range @@ -357,9 +371,8 @@ struct TypedRangeStrideSegment { * \param stride stride value when iterating over the range */ using StripStorageT = strip_index_type_t; - RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin, - StripStorageT end, - DiffT stride) + RAJA_HOST_DEVICE + TypedRangeStrideSegment(StripStorageT begin, StripStorageT end, DiffT stride) : m_begin(iterator(begin, stride)), m_end(iterator(end, stride)), // essentially a ceil((end-begin)/stride) but using integer math, @@ -367,9 +380,12 @@ struct TypedRangeStrideSegment { m_size((end - begin + stride - (stride > 0 ? 1 : -1)) / stride) { // clamp range when end is unreachable from begin without wrapping - if (stride < 0 && end > begin) { + if (stride < 0 && end > begin) + { m_end = m_begin; - } else if (stride > 0 && end < begin) { + } + else if (stride > 0 && end < begin) + { m_end = m_begin; } // m_size initialized as negative indicates a zero iteration space @@ -408,8 +424,8 @@ struct TypedRangeStrideSegment { /*! * \brief Get size of this segment - * - * The size is the number of iterates in the + * + * The size is the number of iterates in the * interval [begin, end) when striding over it */ RAJA_HOST_DEVICE DiffT size() const { return m_size; } @@ -435,7 +451,8 @@ struct TypedRangeStrideSegment { * * \return true if begin, end, or size does not match, else false */ - RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeStrideSegment const& o) const + RAJA_HOST_DEVICE RAJA_INLINE bool + operator!=(TypedRangeStrideSegment const& o) const { return !(operator==(o)); } @@ -450,7 +467,7 @@ struct TypedRangeStrideSegment { * \param length maximum length of new range * * \return TypedRangeStrideSegment representing the interval - * [ *begin() + begin * stride, + * [ *begin() + begin * stride, * min( *begin() + (begin + length) * stride, *end() ) * * Here's an example of a slice operation on a range segment with a negative @@ -466,7 +483,7 @@ struct TypedRangeStrideSegment { * // 5 indices in r starting at the 6th entry * auto s = r.slice(6, 6); * - * \endverbatim + * \endverbatim */ RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin, DiffT length) const @@ -475,15 +492,17 @@ struct TypedRangeStrideSegment { StorageT start = m_begin[0] + begin * stride; StorageT end = start + stride * length; - if (stride > 0) { + if (stride > 0) + { end = end > m_end[0] ? m_end[0] : end; - } else { + } + else + { end = end < m_end[0] ? m_end[0] : end; } - return TypedRangeStrideSegment{stripIndexType(start), - stripIndexType(end), - m_begin.get_stride()}; + return TypedRangeStrideSegment{ + stripIndexType(start), stripIndexType(end), m_begin.get_stride()}; } /*! 
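
A similarly small sketch of the strided-range behavior preserved by the reformatted constructor and slice() hunks above (values chosen purely for illustration):

    // Forward stride: iterates 0, 2, 4, 6, 8, so size() == 5
    RAJA::TypedRangeStrideSegment<int> fwd(0, 10, 2);

    // Negative stride runs backward: iterates 10, 8, 6, 4, 2, so size() == 5
    RAJA::TypedRangeStrideSegment<int> bwd(10, 0, -2);

    // slice(1, 3) starts at the 2nd entry and keeps at most 3 iterates: 2, 4, 6
    auto sub = fwd.slice(1, 3);

The same forward segment can be built with RAJA::make_strided_range(0, 10, 2), which, as the hunk above shows, static_asserts that the stride type is signed and matches the signed form of the end type.
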
@@ -518,18 +537,19 @@ namespace detail template struct common_type - : std::common_type::type> { -}; + : std::common_type::type> +{}; template -struct common_type { +struct common_type +{ using type = T; }; template using common_type_t = typename common_type::type; -} // namespace detail +} // namespace detail /*! * \brief Function to make a TypedRangeSegment for the interval [begin, end) @@ -549,7 +569,7 @@ RAJA_HOST_DEVICE TypedRangeSegment make_range(BeginT&& begin, } /*! - * \brief Function to make a TypedRangeStride Segment for the interval + * \brief Function to make a TypedRangeStride Segment for the interval * [begin, end) with given stride * * \return a newly constructed TypedRangeStrideSegment where @@ -561,13 +581,14 @@ template > -RAJA_HOST_DEVICE TypedRangeStrideSegment make_strided_range( - BeginT&& begin, - EndT&& end, - StrideT&& stride) +RAJA_HOST_DEVICE TypedRangeStrideSegment +make_strided_range(BeginT&& begin, EndT&& end, StrideT&& stride) { - static_assert(std::is_signed::value, "make_strided_segment : stride must be signed."); - static_assert(std::is_same, StrideT>::value, "make_stride_segment : stride and end must be of similar types."); + static_assert(std::is_signed::value, + "make_strided_segment : stride must be signed."); + static_assert(std::is_same, StrideT>::value, + "make_stride_segment : stride and end must be of similar " + "types."); return {begin, end, stride}; } @@ -576,15 +597,15 @@ namespace concepts template struct RangeConstructible - : DefineConcept(camp::val>()) { -}; + : DefineConcept(camp::val>()) +{}; template struct RangeStrideConstructible - : DefineConcept(camp::val>()) { -}; + : DefineConcept(camp::val>()) +{}; -} // namespace concepts +} // namespace concepts namespace type_traits { @@ -595,9 +616,9 @@ DefineTypeTraitFromConcept(is_range_constructible, DefineTypeTraitFromConcept(is_range_stride_constructible, RAJA::concepts::RangeStrideConstructible); -} // namespace type_traits +} // namespace type_traits -} // namespace RAJA +} // namespace RAJA namespace std { @@ -618,6 +639,6 @@ RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeStrideSegment& a, a.swap(b); } -} // namespace std +} // namespace std -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp index 8feceae22f..6c9858221a 100644 --- a/include/RAJA/internal/DepGraphNode.hpp +++ b/include/RAJA/internal/DepGraphNode.hpp @@ -57,8 +57,7 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// DepGraphNode() : m_num_dep_tasks(0), m_semaphore_reload_value(0), m_semaphore_value(0) - { - } + {} /// /// Get/set semaphore value; i.e., the current number of (unsatisfied) @@ -82,7 +81,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// void satisfyOne() { - if (m_semaphore_value > 0) { + if (m_semaphore_value > 0) + { --m_semaphore_value; } } @@ -92,7 +92,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode /// void wait() { - while (m_semaphore_value > 0) { + while (m_semaphore_value > 0) + { // TODO: an efficient wait would be better here, but the standard // promise/future is not good enough std::this_thread::yield(); @@ -124,6 +125,6 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode std::atomic m_semaphore_value; }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp index 
6f32a56e6d..73628a035b 100644 --- a/include/RAJA/internal/Iterators.hpp +++ b/include/RAJA/internal/Iterators.hpp @@ -50,7 +50,8 @@ std::string overflow_msg(LType lhs, RType rhs) template RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs) { - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((rhs > 0) && (lhs > std::numeric_limits::max() - rhs)) return true; if ((rhs < 0) && (lhs < std::numeric_limits::min() - rhs)) @@ -64,18 +65,22 @@ RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs, DifferenceType rhs, bool iterator_on_left = true) { - if (iterator_on_left) { + if (iterator_on_left) + { - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((rhs > 0) && (lhs < std::numeric_limits::min() + rhs)) return true; if ((rhs < 0) && (lhs > std::numeric_limits::max() + rhs)) return true; } + } + else + { // Special case where operation is : value(lhs) - iterator(rhs). - } else { // Special case where operation is : value(lhs) - iterator(rhs). - - if (std::is_unsigned::value) { + if (std::is_unsigned::value) + { if ((lhs > 0) && (rhs < std::numeric_limits::min() + lhs)) return true; if ((lhs < 0)) return true; @@ -121,8 +126,7 @@ class numeric_iterator RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs) : val(rhs) - { - } + {} RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; } @@ -174,8 +178,8 @@ class numeric_iterator return tmp; } - RAJA_HOST_DEVICE inline numeric_iterator& operator+=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator+=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs); @@ -183,8 +187,8 @@ class numeric_iterator val += rhs; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator-=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator-=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs); @@ -192,48 +196,47 @@ class numeric_iterator val -= rhs; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator+=( - const numeric_iterator& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator+=(const numeric_iterator& rhs) { val += rhs.val; return *this; } - RAJA_HOST_DEVICE inline numeric_iterator& operator-=( - const numeric_iterator& rhs) + RAJA_HOST_DEVICE inline numeric_iterator& + operator-=(const numeric_iterator& rhs) { val -= rhs.val; return *this; } - RAJA_HOST_DEVICE inline stripped_value_type operator+( - const numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline stripped_value_type + operator+(const numeric_iterator& rhs) const { return val + rhs.val; } - RAJA_HOST_DEVICE inline stripped_value_type operator-( - const numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline stripped_value_type + operator-(const numeric_iterator& rhs) const { return val - rhs.val; } - RAJA_HOST_DEVICE inline numeric_iterator operator+( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline numeric_iterator + operator+(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs); #endif return numeric_iterator(val + rhs); } - RAJA_HOST_DEVICE inline numeric_iterator operator-( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline numeric_iterator + operator-(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs); #endif 
return numeric_iterator(val - rhs); } - RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+( - difference_type lhs, - const numeric_iterator& rhs) + RAJA_HOST_DEVICE friend constexpr numeric_iterator + operator+(difference_type lhs, const numeric_iterator& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) return is_addition_overflow(rhs.val, lhs) @@ -243,9 +246,8 @@ class numeric_iterator return numeric_iterator(lhs + rhs.val); #endif } - RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-( - difference_type lhs, - const numeric_iterator& rhs) + RAJA_HOST_DEVICE friend constexpr numeric_iterator + operator-(difference_type lhs, const numeric_iterator& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) return is_subtraction_overflow(rhs.val, lhs, false) @@ -287,17 +289,20 @@ class strided_numeric_iterator using iterator_category = std::random_access_iterator_tag; constexpr strided_numeric_iterator() noexcept = default; - constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = default; - constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = default; - strided_numeric_iterator& operator=(const strided_numeric_iterator&) noexcept = default; - strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept = default; + constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = + default; + constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = + default; + strided_numeric_iterator& + operator=(const strided_numeric_iterator&) noexcept = default; + strided_numeric_iterator& + operator=(strided_numeric_iterator&&) noexcept = default; RAJA_HOST_DEVICE constexpr strided_numeric_iterator( stripped_value_type rhs, DifferenceType stride_ = DifferenceType(1)) : val(rhs), stride(stride_) - { - } + {} RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return stride; } @@ -312,8 +317,8 @@ class strided_numeric_iterator return *this; } - RAJA_HOST_DEVICE inline strided_numeric_iterator& operator+=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline strided_numeric_iterator& + operator+=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs * stride); @@ -321,8 +326,8 @@ class strided_numeric_iterator val += rhs * stride; return *this; } - RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=( - const difference_type& rhs) + RAJA_HOST_DEVICE inline strided_numeric_iterator& + operator-=(const difference_type& rhs) { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs * stride); @@ -331,15 +336,15 @@ class strided_numeric_iterator return *this; } - RAJA_HOST_DEVICE inline difference_type operator+( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline difference_type + operator+(const strided_numeric_iterator& rhs) const { return (static_cast(val) + (static_cast(rhs.val))) / stride; } - RAJA_HOST_DEVICE inline difference_type operator-( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline difference_type + operator-(const strided_numeric_iterator& rhs) const { difference_type diff = (static_cast(val) - (static_cast(rhs.val))); @@ -348,16 +353,16 @@ class strided_numeric_iterator ? 
(difference_type{1} + diff / stride) : diff / stride; } - RAJA_HOST_DEVICE inline strided_numeric_iterator operator+( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline strided_numeric_iterator + operator+(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_addition_overflow(val, rhs * stride); #endif return strided_numeric_iterator(val + rhs * stride, stride); } - RAJA_HOST_DEVICE inline strided_numeric_iterator operator-( - const difference_type& rhs) const + RAJA_HOST_DEVICE inline strided_numeric_iterator + operator-(const difference_type& rhs) const { #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG) check_is_subtraction_overflow(val, rhs * stride); @@ -367,34 +372,34 @@ class strided_numeric_iterator // Specialized comparison to allow normal iteration to work on off-stride // multiples by adjusting rhs to the nearest *higher* multiple of stride - RAJA_HOST_DEVICE inline bool operator!=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator!=(const strided_numeric_iterator& rhs) const { return (val - rhs.val) / stride; } - RAJA_HOST_DEVICE inline bool operator==( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator==(const strided_numeric_iterator& rhs) const { return !((val - rhs.val) / stride); } - RAJA_HOST_DEVICE inline bool operator>( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator>(const strided_numeric_iterator& rhs) const { return val * stride > rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator<( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator<(const strided_numeric_iterator& rhs) const { return val * stride < rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator>=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator>=(const strided_numeric_iterator& rhs) const { return val * stride >= rhs.val * stride; } - RAJA_HOST_DEVICE inline bool operator<=( - const strided_numeric_iterator& rhs) const + RAJA_HOST_DEVICE inline bool + operator<=(const strided_numeric_iterator& rhs) const { return val * stride <= rhs.val * stride; } @@ -419,8 +424,8 @@ class strided_numeric_iterator }; -} // namespace Iterators +} // namespace Iterators -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_ITERATORS_HPP */ diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp index 55015f9ab7..e1540c8384 100644 --- a/include/RAJA/internal/MemUtils_CPU.hpp +++ b/include/RAJA/internal/MemUtils_CPU.hpp @@ -27,7 +27,7 @@ #include "RAJA/util/types.hpp" -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ +#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ defined(__MINGW32__) || defined(__BORLANDC__) #define RAJA_PLATFORM_WINDOWS #include @@ -53,10 +53,10 @@ inline void* allocate_aligned(size_t alignment, size_t size) #elif defined(RAJA_PLATFORM_WINDOWS) return _aligned_malloc(size, alignment); #else - char *mem = (char *)malloc(size + alignment + sizeof(void *)); + char* mem = (char*)malloc(size + alignment + sizeof(void*)); if (nullptr == mem) return nullptr; - void **ptr = (void **)((std::uintptr_t)(mem + alignment + sizeof(void *)) & - ~(alignment - 1)); + void** ptr = (void**)((std::uintptr_t)(mem + alignment + sizeof(void*)) & + ~(alignment - 1)); // Store the original address one position behind what we give the user. 
ptr[-1] = mem; return ptr; @@ -97,30 +97,28 @@ inline void free_aligned(void* ptr) /// struct FreeAligned { - void operator()(void* ptr) - { - free_aligned(ptr); - } + void operator()(void* ptr) { free_aligned(ptr); } }; /// /// Deleter function object for memory allocated with allocate_aligned_type /// that calls the destructor for the fist size objects in the storage. /// -template < typename T, typename index_type > +template struct FreeAlignedType : FreeAligned { index_type size = 0; void operator()(T* ptr) { - for ( index_type i = size; i > 0; --i ) { - ptr[i-1].~T(); + for (index_type i = size; i > 0; --i) + { + ptr[i - 1].~T(); } FreeAligned::operator()(ptr); } }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp index 1d0ec0cbeb..966a319bbc 100644 --- a/include/RAJA/internal/RAJAVec.hpp +++ b/include/RAJA/internal/RAJAVec.hpp @@ -49,7 +49,7 @@ namespace RAJA * ****************************************************************************** */ -template > +template > class RAJAVec { using allocator_traits_type = std::allocator_traits; @@ -57,8 +57,9 @@ class RAJAVec typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; + public: using value_type = T; using allocator_type = Allocator; @@ -86,7 +87,9 @@ class RAJAVec /// RAJAVec(const RAJAVec& other) : m_data(nullptr), - m_allocator(allocator_traits_type::select_on_container_copy_construction(other.m_allocator)), + m_allocator( + allocator_traits_type::select_on_container_copy_construction( + other.m_allocator)), m_capacity(0), m_size(0) { @@ -113,7 +116,8 @@ class RAJAVec /// RAJAVec& operator=(const RAJAVec& rhs) { - if (&rhs != this) { + if (&rhs != this) + { copy_assign_private(rhs, propagate_on_container_copy_assignment{}); } return *this; @@ -124,8 +128,10 @@ class RAJAVec /// RAJAVec& operator=(RAJAVec&& rhs) { - if (&rhs != this) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (&rhs != this) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -150,25 +156,25 @@ class RAJAVec /// /// Get a pointer to the beginning of the contiguous vector /// - pointer data() { return m_data; } + pointer data() { return m_data; } /// const_pointer data() const { return m_data; } /// /// Get an iterator to the end. /// - iterator end() { return m_data + m_size; } + iterator end() { return m_data + m_size; } /// - const_iterator end() const { return m_data + m_size; } + const_iterator end() const { return m_data + m_size; } /// const_iterator cend() const { return m_data + m_size; } /// /// Get an iterator to the beginning. /// - iterator begin() { return m_data; } + iterator begin() { return m_data; } /// - const_iterator begin() const { return m_data; } + const_iterator begin() const { return m_data; } /// const_iterator cbegin() const { return m_data; } @@ -200,18 +206,12 @@ class RAJAVec /// /// Shrink the capacity of the vector to the current size. /// - void shrink_to_fit() - { - shrink_cap(m_size); - } + void shrink_to_fit() { shrink_cap(m_size); } /// /// Empty vector of all data. 
/// - void clear() - { - destroy_items_after(0); - } + void clear() { destroy_items_after(0); } /// /// Change the size of the vector, @@ -221,10 +221,13 @@ class RAJAVec RAJA_INLINE void resize(size_type new_size) { - if (new_size >= size()) { + if (new_size >= size()) + { reserve(new_size); construct_items_back(new_size); - } else { + } + else + { destroy_items_after(new_size); } } @@ -237,10 +240,13 @@ class RAJAVec RAJA_INLINE void resize(size_type new_size, const_reference new_value) { - if (new_size >= size()) { + if (new_size >= size()) + { reserve(new_size); construct_items_back(new_size, new_value); - } else { + } + else + { destroy_items_after(new_size); } } @@ -248,23 +254,23 @@ class RAJAVec /// /// Bracket operator accessor. /// - reference operator[](difference_type i) { return m_data[i]; } + reference operator[](difference_type i) { return m_data[i]; } /// const_reference operator[](difference_type i) const { return m_data[i]; } /// /// Access the last item of the vector. /// - reference front() { return m_data[0]; } + reference front() { return m_data[0]; } /// const_reference front() const { return m_data[0]; } /// /// Access the last item of the vector. /// - reference back() { return m_data[m_size-1]; } + reference back() { return m_data[m_size - 1]; } /// - const_reference back() const { return m_data[m_size-1]; } + const_reference back() const { return m_data[m_size - 1]; } /// /// Add item to front end of vector. Note that this operation is unique to @@ -272,28 +278,31 @@ class RAJAVec /// void push_front(const_reference item) { emplace_front_private(item); } /// - void push_front( value_type&& item) { emplace_front_private(std::move(item)); } + void push_front(value_type&& item) { emplace_front_private(std::move(item)); } /// - template < typename ... Os > - void emplace_front(Os&&... os) { emplace_front_private(std::forward(os)...); } + template + void emplace_front(Os&&... os) + { + emplace_front_private(std::forward(os)...); + } /// /// Add item to back end of vector. /// void push_back(const_reference item) { emplace_back_private(item); } /// - void push_back( value_type&& item) { emplace_back_private(std::move(item)); } + void push_back(value_type&& item) { emplace_back_private(std::move(item)); } /// - template < typename ... Os > - void emplace_back(Os&&... os) { emplace_back_private(std::forward(os)...); } + template + void emplace_back(Os&&... os) + { + emplace_back_private(std::forward(os)...); + } /// /// Remove the last item of the vector. 
/// - void pop_back() - { - destroy_items_after(m_size-1); - } + void pop_back() { destroy_items_after(m_size - 1); } private: pointer m_data; @@ -307,7 +316,8 @@ class RAJAVec /// void copy_assign_private(RAJAVec const& rhs, std::true_type) { - if (m_allocator != rhs.m_allocator) { + if (m_allocator != rhs.m_allocator) + { clear(); shrink_to_fit(); m_allocator = rhs.m_allocator; @@ -323,10 +333,13 @@ class RAJAVec void copy_assign_private(RAJAVec const& rhs, std::false_type) { reserve(rhs.size()); - if (size() < rhs.size()) { + if (size() < rhs.size()) + { copy_assign_items(0, size(), rhs.data()); copy_construct_items_back(rhs.size(), rhs.data()); - } else { + } + else + { copy_assign_items(0, rhs.size(), rhs.data()); destroy_items_after(size()); } @@ -357,7 +370,8 @@ class RAJAVec /// void move_assign_private(RAJAVec&& rhs, std::false_type) { - if (m_allocator == rhs.m_allocator) { + if (m_allocator == rhs.m_allocator) + { clear(); shrink_to_fit(); @@ -368,12 +382,17 @@ class RAJAVec rhs.m_data = nullptr; rhs.m_capacity = 0; rhs.m_size = 0; - } else { + } + else + { reserve(rhs.size()); - if (size() < rhs.size()) { + if (size() < rhs.size()) + { move_assign_items(0, size(), rhs.data()); move_construct_items_back(rhs.size(), rhs.data()); - } else { + } + else + { move_assign_items(0, rhs.size(), rhs.data()); destroy_items_after(size()); } @@ -386,10 +405,10 @@ class RAJAVec void swap_private(RAJAVec& other, std::true_type) { using std::swap; - swap(m_data, other.m_data); + swap(m_data, other.m_data); swap(m_allocator, other.m_allocator); - swap(m_capacity, other.m_capacity); - swap(m_size, other.m_size); + swap(m_capacity, other.m_capacity); + swap(m_size, other.m_size); } /// @@ -398,9 +417,9 @@ class RAJAVec void swap_private(RAJAVec& other, std::false_type) { using std::swap; - swap(m_data, other.m_data); - swap(m_capacity, other.m_capacity); - swap(m_size, other.m_size); + swap(m_data, other.m_data); + swap(m_capacity, other.m_capacity); + swap(m_size, other.m_size); } // @@ -408,7 +427,8 @@ class RAJAVec // void copy_assign_items(size_type first, size_type last, const_pointer o_data) { - for (size_type i = first; i < last; ++i) { + for (size_type i = first; i < last; ++i) + { m_data[i] = o_data[i]; } } @@ -418,7 +438,8 @@ class RAJAVec // void move_assign_items(size_type first, size_type last, pointer o_data) { - for (size_type i = first; i < last; ++i) { + for (size_type i = first; i < last; ++i) + { m_data[i] = std::move(o_data[i]); } } @@ -426,11 +447,13 @@ class RAJAVec // // Construct items [m_size, new_size) from args. // - template < typename ... Os > + template void construct_items_back(size_type new_size, Os&&... 
os) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward(os)...); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::forward(os)...); } } @@ -439,8 +462,10 @@ class RAJAVec // void copy_construct_items_back(size_type new_size, const_pointer o_data) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, o_data[m_size]); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, o_data[m_size]); } } @@ -449,8 +474,10 @@ class RAJAVec // void move_construct_items_back(size_type new_size, pointer o_data) { - for (; m_size < new_size; ++m_size) { - allocator_traits_type::construct(m_allocator, m_data+m_size, std::move(o_data[m_size])); + for (; m_size < new_size; ++m_size) + { + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::move(o_data[m_size])); } } @@ -459,39 +486,45 @@ class RAJAVec // void destroy_items_after(size_type new_end) { - for (; m_size > new_end; --m_size) { - allocator_traits_type::destroy(m_allocator, m_data+m_size-1); + for (; m_size > new_end; --m_size) + { + allocator_traits_type::destroy(m_allocator, m_data + m_size - 1); } } // // Add an item to the front, shifting all existing items back one. // - template < typename ... Os > + template void emplace_front_private(Os&&... os) { reserve(m_size + 1); - if (m_size > 0) { + if (m_size > 0) + { size_type i = m_size; - allocator_traits_type::construct(m_allocator, m_data+i, std::move(m_data[i - 1])); - for (--i; i > 0; --i) { + allocator_traits_type::construct( + m_allocator, m_data + i, std::move(m_data[i - 1])); + for (--i; i > 0; --i) + { m_data[i] = std::move(m_data[i - 1]); } allocator_traits_type::destroy(m_allocator, m_data); } - allocator_traits_type::construct(m_allocator, m_data, std::forward(os)...); + allocator_traits_type::construct( + m_allocator, m_data, std::forward(os)...); m_size++; } // // Add an item to the back. // - template < typename ... Os > + template void emplace_back_private(Os&&... 
os) { reserve(m_size + 1); - allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward(os)...); + allocator_traits_type::construct( + m_allocator, m_data + m_size, std::forward(os)...); m_size++; } @@ -509,7 +542,8 @@ class RAJAVec size_type get_next_cap(size_type target_size) { size_type next_cap = s_init_cap; - if (m_capacity != 0) { + if (m_capacity != 0) + { next_cap = static_cast(m_capacity * s_grow_fac); } return std::max(target_size, next_cap); @@ -520,7 +554,8 @@ class RAJAVec // void grow_cap(size_type target_size) { - if (m_capacity < target_size) { + if (m_capacity < target_size) + { change_cap(get_next_cap(target_size)); } } @@ -530,7 +565,8 @@ class RAJAVec // void shrink_cap(size_type target_size) { - if (m_capacity > target_size) { + if (m_capacity > target_size) + { change_cap(std::max(m_size, target_size)); } } @@ -542,14 +578,18 @@ class RAJAVec void change_cap(size_type next_cap) { pointer tdata = nullptr; - if (next_cap != 0) { + if (next_cap != 0) + { tdata = allocator_traits_type::allocate(m_allocator, next_cap); } - if (m_data) { - for (size_type i = 0; i < m_size; ++i) { - allocator_traits_type::construct(m_allocator, tdata+i, std::move(m_data[i])); - allocator_traits_type::destroy(m_allocator, m_data+i); + if (m_data) + { + for (size_type i = 0; i < m_size; ++i) + { + allocator_traits_type::construct( + m_allocator, tdata + i, std::move(m_data[i])); + allocator_traits_type::destroy(m_allocator, m_data + i); } allocator_traits_type::deallocate(m_allocator, m_data, m_capacity); } @@ -559,6 +599,6 @@ class RAJAVec } }; -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/ThreadUtils_CPU.hpp b/include/RAJA/internal/ThreadUtils_CPU.hpp index addd22c4f7..c83905ea77 100644 --- a/include/RAJA/internal/ThreadUtils_CPU.hpp +++ b/include/RAJA/internal/ThreadUtils_CPU.hpp @@ -47,6 +47,6 @@ int getMaxOMPThreadsCPU() return nthreads; } -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp index cf3a86cede..da72005702 100644 --- a/include/RAJA/internal/fault_tolerance.hpp +++ b/include/RAJA/internal/fault_tolerance.hpp @@ -37,62 +37,74 @@ #include #include "cycle.h" -#define RAJA_FT_BEGIN \ - extern volatile int fault_type; \ - bool repeat; \ - bool do_time = false; \ - ticks start = 0, stop = 0; \ - if (fault_type != 0) { \ - printf("Uncaught fault %d\n", fault_type); \ - fault_type = 0; \ - } \ - do { \ - repeat = false; \ - if (do_time) { \ - start = getticks(); \ +#define RAJA_FT_BEGIN \ + extern volatile int fault_type; \ + bool repeat; \ + bool do_time = false; \ + ticks start = 0, stop = 0; \ + if (fault_type != 0) \ + { \ + printf("Uncaught fault %d\n", fault_type); \ + fault_type = 0; \ + } \ + do \ + { \ + repeat = false; \ + if (do_time) \ + { \ + start = getticks(); \ } -#define RAJA_FT_END \ - if (do_time) { \ - stop = getticks(); \ - printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \ - do_time = false; \ - fault_type = 0; \ - } \ - if (fault_type < 0) { \ - printf("Unrecoverable fault (restart penalty)\n"); \ - fault_type = 0; \ - } \ - if (fault_type > 0) { \ - /* invalidate cache */ \ - repeat = true; \ - do_time = true; \ - } \ - } \ - while (repeat == true) \ +#define RAJA_FT_END \ + if (do_time) \ 
+ { \ + stop = getticks(); \ + printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \ + do_time = false; \ + fault_type = 0; \ + } \ + if (fault_type < 0) \ + { \ + printf("Unrecoverable fault (restart penalty)\n"); \ + fault_type = 0; \ + } \ + if (fault_type > 0) \ + { \ + /* invalidate cache */ \ + repeat = true; \ + do_time = true; \ + } \ + } \ + while (repeat == true) \ ; #else -#define RAJA_FT_BEGIN \ - extern volatile int fault_type; \ - bool repeat; \ - if (fault_type == 0) { \ - do { \ +#define RAJA_FT_BEGIN \ + extern volatile int fault_type; \ + bool repeat; \ + if (fault_type == 0) \ + { \ + do \ + { \ repeat = false; -#define RAJA_FT_END \ - if (fault_type > 0) { \ - /* invalidate cache */ \ - repeat = true; \ - fault_type = 0; \ - } \ - } \ - while (repeat == true) \ - ; \ - } \ - else { fault_type = 0; /* ignore for the simulation */ } - -#endif // RAJA_REPORT_FT +#define RAJA_FT_END \ + if (fault_type > 0) \ + { \ + /* invalidate cache */ \ + repeat = true; \ + fault_type = 0; \ + } \ + } \ + while (repeat == true) \ + ; \ + } \ + else \ + { \ + fault_type = 0; /* ignore for the simulation */ \ + } + +#endif // RAJA_REPORT_FT #else @@ -100,6 +112,6 @@ #define RAJA_FT_END -#endif // RAJA_ENABLE_FT +#endif // RAJA_ENABLE_FT -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp index af65c05392..3e9e0bc15a 100644 --- a/include/RAJA/internal/foldl.hpp +++ b/include/RAJA/internal/foldl.hpp @@ -44,14 +44,16 @@ template struct foldl_impl; template -struct foldl_impl { +struct foldl_impl +{ using Ret = Arg1; }; #if RAJA_HAS_CXX17_IS_INVOCABLE template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename std::invoke_result::type; }; @@ -60,18 +62,22 @@ template -struct foldl_impl { - using Ret = typename foldl_impl< - Op, - typename std::invoke_result::type, - Arg3>::type, - Rest...>::Ret; +struct foldl_impl +{ + using Ret = + typename foldl_impl::type, + Arg3>::type, + Rest...>::Ret; }; #else template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename std::result_of::type; }; @@ -80,7 +86,8 @@ template -struct foldl_impl { +struct foldl_impl +{ using Ret = typename foldl_impl< Op, typename std::result_of::type, @@ -93,17 +100,16 @@ struct foldl_impl { } // namespace detail template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl( - Op&& RAJA_UNUSED_ARG(operation), - Arg1&& arg) -> typename detail::foldl_impl::Ret +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& RAJA_UNUSED_ARG(operation), Arg1&& arg) -> + typename detail::foldl_impl::Ret { return camp::forward(arg); } template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2) -> +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2) -> typename detail::foldl_impl::Ret { return camp::forward(operation)(camp::forward(arg1), @@ -115,11 +121,8 @@ template -RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation, - Arg1&& arg1, - Arg2&& arg2, - Arg3&& arg3, - Rest&&... rest) -> +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto +foldl(Op&& operation, Arg1&& arg1, Arg2&& arg2, Arg3&& arg3, Rest&&... rest) -> typename detail::foldl_impl::Ret { return foldl(camp::forward(operation), @@ -157,6 +160,6 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... 
args) } -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/internal/get_platform.hpp b/include/RAJA/internal/get_platform.hpp index 0354d04bfd..d32344629c 100644 --- a/include/RAJA/internal/get_platform.hpp +++ b/include/RAJA/internal/get_platform.hpp @@ -8,18 +8,21 @@ namespace RAJA { -namespace policy { -namespace multi { +namespace policy +{ +namespace multi +{ template class MultiPolicy; } -} +} // namespace policy -namespace detail +namespace detail { -struct max_platform { +struct max_platform +{ RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA::Platform operator()(const RAJA::Platform& l, @@ -34,7 +37,8 @@ struct max_platform { * This is a catch-all, so anything undefined gets Platform::undefined */ template -struct get_platform { +struct get_platform +{ // catch-all: undefined platform static constexpr Platform value = Platform::undefined; }; @@ -45,7 +49,8 @@ struct get_platform { * reduction of them all. */ template -struct get_platform_from_list { +struct get_platform_from_list +{ static constexpr Platform value = foldl(max_platform(), get_platform::value...); }; @@ -54,7 +59,8 @@ struct get_platform_from_list { * Define an empty list as Platform::undefined; */ template <> -struct get_platform_from_list<> { +struct get_platform_from_list<> +{ static constexpr Platform value = Platform::undefined; }; @@ -67,10 +73,10 @@ struct get_platform_from_list<> { */ template struct get_platform::value - && !RAJA::type_traits::is_indexset_policy:: - value>::type> { + typename std::enable_if< + std::is_base_of::value && + !RAJA::type_traits::is_indexset_policy::value>::type> +{ static constexpr Platform value = T::platform; }; @@ -83,12 +89,13 @@ struct get_platform struct get_platform> - : public get_platform_from_list { -}; + : public get_platform_from_list +{}; template -struct get_statement_platform { +struct get_statement_platform +{ static constexpr Platform value = get_platform_from_list::value; @@ -102,7 +109,8 @@ struct get_statement_platform { * each of them. */ template -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = foldl(max_platform(), get_statement_platform::value...); }; @@ -111,7 +119,8 @@ struct get_platform> { * Specialize for an empty statement list to be undefined */ template <> -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = Platform::undefined; }; @@ -120,11 +129,12 @@ struct get_platform> { // Once a specific policy is selected, that policy will select the correct // platform... 
see policy_invoker in MultiPolicy.hpp template -struct get_platform> { +struct get_platform> +{ static constexpr Platform value = Platform::undefined; }; -} // closing brace for detail namespace -} // closing brace for RAJA namespace +} // namespace detail +} // namespace RAJA #endif // RAJA_get_platform_HPP diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp index 767821b8d8..b3e50fea8e 100644 --- a/include/RAJA/pattern/WorkGroup.hpp +++ b/include/RAJA/pattern/WorkGroup.hpp @@ -38,38 +38,44 @@ namespace RAJA * * \verbatim - WorkPool, Allocator> pool(allocator); + WorkPool, Allocator> + pool(allocator); pool.enqueue(..., [=] (Index_type i, int* xarg0, int xarg1) { xarg0[i] = xarg1; }); - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); int* xarg0 = ...; int xarg1 = ...; - WorkSite, Allocator> site = group.run(xarg0, xarg1); + WorkSite, Allocator> site = + group.run(xarg0, xarg1); * \endverbatim * ****************************************************************************** */ -template < typename ... Args > +template using xargs = camp::list; -namespace detail { +namespace detail +{ -template < typename T > -struct is_xargs { +template +struct is_xargs +{ static constexpr bool value = false; }; -template < typename ... Args > -struct is_xargs> { +template +struct is_xargs> +{ static constexpr bool value = true; }; -} +} // namespace detail // @@ -102,7 +108,8 @@ struct is_xargs> { data[i] = 1; }); - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); * \endverbatim * @@ -112,11 +119,15 @@ template -struct WorkPool { - static_assert(RAJA::pattern_is::value, +struct WorkPool +{ + static_assert( + RAJA::pattern_is::value, "WorkPool: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkPool: EXTRA_ARGS_T " + "must be a " + "RAJA::xargs<...> type"); }; /*! @@ -135,9 +146,11 @@ struct WorkPool { * * \verbatim - WorkGroup, Allocator> group = pool.instantiate(); + WorkGroup, Allocator> group = + pool.instantiate(); - WorkSite, Allocator> site = group.run(); + WorkSite, Allocator> site = + group.run(); * \endverbatim * @@ -147,11 +160,15 @@ template -struct WorkGroup { - static_assert(RAJA::pattern_is::value, +struct WorkGroup +{ + static_assert( + RAJA::pattern_is::value, "WorkGroup: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkGroup: " + "EXTRA_ARGS_T must be a " + "RAJA::xargs<...> type"); }; /*! 
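
The WorkPool, WorkGroup, and WorkSite templates above are meant to be used together, as their usage comments indicate. A sequential sketch in that spirit follows; the seq_work/ordered/ragged_array_of_objects policy choices and std::allocator<char> are assumptions made for illustration and are not taken from this patch:

    #include "RAJA/RAJA.hpp"
    #include <memory>
    #include <vector>

    using wg_policy = RAJA::WorkGroupPolicy<RAJA::seq_work,
                                            RAJA::ordered,
                                            RAJA::ragged_array_of_objects,
                                            RAJA::indirect_function_call_dispatch>;
    using Alloc = std::allocator<char>;

    void fill_ones(std::vector<double>& v)
    {
      double* data = v.data();
      const int n = static_cast<int>(v.size());

      // Collect loops in the pool, freeze them into a group, then run them.
      RAJA::WorkPool<wg_policy, int, RAJA::xargs<>, Alloc> pool(Alloc{});
      pool.enqueue(RAJA::TypedRangeSegment<int>(0, n),
                   [=](int i) { data[i] = 1.0; });

      RAJA::WorkGroup<wg_policy, int, RAJA::xargs<>, Alloc> group = pool.instantiate();
      RAJA::WorkSite<wg_policy, int, RAJA::xargs<>, Alloc> site = group.run();
      site.synchronize();
    }
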
@@ -170,7 +187,8 @@ struct WorkGroup { * * \verbatim - WorkSite, Allocator> site = group.run(); + WorkSite, Allocator> site = + group.run(); site.synchronize(); @@ -182,11 +200,15 @@ template -struct WorkSite { - static_assert(RAJA::pattern_is::value, +struct WorkSite +{ + static_assert( + RAJA::pattern_is::value, "WorkSite: WORKGROUP_POLICY_T must be a workgroup policy"); static_assert(detail::is_xargs::value, - "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type"); + "WorkSite: EXTRA_ARGS_T " + "must be a " + "RAJA::xargs<...> type"); }; @@ -195,7 +217,7 @@ template struct WorkPool; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -218,10 +243,16 @@ struct WorkPool; private: - using workrunner_type = detail::WorkRunner< - exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>; - using storage_type = detail::WorkStorage< - storage_policy, Allocator, typename workrunner_type::dispatcher_type>; + using workrunner_type = detail::WorkRunner; + using storage_type = + detail::WorkStorage; friend workgroup_type; friend worksite_type; @@ -229,9 +260,7 @@ struct WorkPool + template inline void enqueue(segment_T&& seg, loop_T&& loop_body) { { // ignore zero length loops - using std::begin; using std::end; + using std::begin; + using std::end; if (begin(seg) == end(seg)) return; } - if (m_storage.begin() == m_storage.end()) { + if (m_storage.begin() == m_storage.end()) + { // perform auto-reserve on reuse reserve(m_max_num_loops, m_max_storage_bytes); } @@ -273,8 +298,7 @@ struct WorkPool(seg), std::move(body)); + m_runner.enqueue(m_storage, std::forward(seg), std::move(body)); util::callPostCapturePlugins(context); } @@ -289,10 +313,7 @@ struct WorkPool struct WorkGroup; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -347,7 +371,8 @@ struct WorkGroup struct WorkSite; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -418,10 +442,7 @@ struct WorkSite -inline -typename WorkPool< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::workgroup_type -WorkPool< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::instantiate() +inline typename WorkPool, + INDEX_T, + xargs, + ALLOCATOR_T>::workgroup_type +WorkPool, + INDEX_T, + xargs, + ALLOCATOR_T>::instantiate() { // update max sizes to auto-reserve on reuse m_max_num_loops = std::max(m_storage.size(), m_max_num_loops); @@ -477,36 +497,43 @@ template -inline -typename WorkGroup< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::worksite_type +inline typename WorkGroup, + INDEX_T, + xargs, + ALLOCATOR_T>::worksite_type WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, - ALLOCATOR_T>::run(typename WorkGroup< - WorkGroupPolicy, - INDEX_T, - xargs, - ALLOCATOR_T>::resource_type r, + ALLOCATOR_T>::run(typename WorkGroup, + INDEX_T, + xargs, + ALLOCATOR_T>::resource_type r, Args... 
args) { util::PluginContext context{util::make_context()}; util::callPreLaunchPlugins(context); // move any per run storage into worksite - worksite_type site(r, m_runner.run(m_storage, r, std::forward(args)...)); + worksite_type site(r, + m_runner.run(m_storage, r, std::forward(args)...)); util::callPostLaunchPlugins(context); return site; } -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp index 1eac283f4b..954e59b9af 100644 --- a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp +++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp @@ -36,35 +36,36 @@ namespace RAJA namespace detail { -template < typename > +template struct DispatcherVoidPtrWrapper { void* ptr; DispatcherVoidPtrWrapper() = default; // implicit constructor from void* - RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { } + RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) {} }; -template < typename > +template struct DispatcherVoidConstPtrWrapper { const void* ptr; DispatcherVoidConstPtrWrapper() = default; // implicit constructor from const void* - RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { } + RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) {} }; -constexpr bool dispatcher_use_host_invoke(Platform platform) { +constexpr bool dispatcher_use_host_invoke(Platform platform) +{ return !(platform == Platform::cuda || platform == Platform::hip); } // Transforms one dispatch policy into another by creating a dispatch policy // of holder_type objects. See usage in WorkRunner for more explanation. -template < typename dispatch_policy, typename holder_type > +template struct dispatcher_transform_types; /// -template < typename dispatch_policy, typename holder_type > +template using dispatcher_transform_types_t = typename dispatcher_transform_types::type; @@ -75,12 +76,17 @@ using dispatcher_transform_types_t = * DispatcherID is used to differentiate function pointers based on their * function signature. */ -template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > +template struct Dispatcher; -template < typename holder_type > -struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> { +template +struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, + holder_type> +{ using type = ::RAJA::indirect_function_call_dispatch; }; @@ -93,8 +99,12 @@ struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holde * during device linking when functions with high register counts may cause * device linking to fail. */ -template < Platform platform, typename DispatcherID, typename ... 
CallArgs > -struct Dispatcher { +template +struct Dispatcher +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::indirect_function_call_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -104,27 +114,29 @@ struct Dispatcher - static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) + template + static void s_move_construct_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } /// /// invoke the call operator of the object of type T in obj with args /// - template < typename T > + template static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args) { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } /// - template < typename T > - static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args) + template + static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, + CallArgs... args) { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); @@ -133,22 +145,26 @@ struct Dispatcher + template static void s_destroy(void_ptr_wrapper obj) { T* obj_as_T = static_cast(obj.ptr); (*obj_as_T).~T(); } - using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); - using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); - using destroyer_type = void(*)(void_ptr_wrapper /*obj*/); + using mover_type = void (*)(void_ptr_wrapper /*dest*/, + void_ptr_wrapper /*src*/); + using invoker_type = void (*)(void_cptr_wrapper /*obj*/, + CallArgs... /*args*/); + using destroyer_type = void (*)(void_ptr_wrapper /*obj*/); // This can't be a cuda device lambda due to compiler limitations - template < typename T > - struct DeviceInvokerFactory { + template + struct DeviceInvokerFactory + { using value_type = invoker_type; - RAJA_DEVICE value_type operator()() { + RAJA_DEVICE value_type operator()() + { #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) return nullptr; #else @@ -160,14 +176,15 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher() { - return { mover_type{&s_move_construct_destroy}, - invoker_type{&s_host_invoke}, - destroyer_type{&s_destroy}, - sizeof(T) - }; + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + return {mover_type{&s_move_construct_destroy}, + invoker_type{&s_host_invoke}, + destroyer_type{&s_destroy}, + sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -179,14 +196,17 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { - return { mover_type{&s_move_construct_destroy}, - invoker_type{std::forward(createOnDevice)(DeviceInvokerFactory{})}, - destroyer_type{&s_destroy}, - sizeof(T) - }; + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) + { + return {mover_type{&s_move_construct_destroy}, + invoker_type{std::forward(createOnDevice)( + DeviceInvokerFactory{})}, + destroyer_type{&s_destroy}, + sizeof(T)}; } mover_type move_construct_destroy; @@ -196,8 +216,10 @@ struct Dispatcher -struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> { +template +struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, + 
holder_type> +{ using type = ::RAJA::indirect_virtual_function_dispatch; }; @@ -210,38 +232,48 @@ struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, ho * during device linking when functions with high register counts may cause * device linking to fail. */ -template < Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher { +template +struct Dispatcher +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; - struct impl_base { - virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0; + struct impl_base + { + virtual void move_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) const = 0; virtual void destroy(void_ptr_wrapper obj) const = 0; }; - struct host_impl_base { + struct host_impl_base + { virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; }; - struct device_impl_base { - virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + struct device_impl_base + { + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, + CallArgs... args) const = 0; }; - template < typename T > + template struct base_impl_type : impl_base { /// /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override + virtual void move_destroy(void_ptr_wrapper dest, + void_ptr_wrapper src) const override { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } @@ -255,7 +287,7 @@ struct Dispatcher + template struct host_impl_type : host_impl_base { /// @@ -268,20 +300,22 @@ struct Dispatcher + template struct device_impl_type : device_impl_base { /// /// invoke the call operator of the object of type T in obj with args /// - virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, + CallArgs... 
args) const override { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct mover_type { + struct mover_type + { impl_base* m_impl; void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { @@ -289,7 +323,8 @@ struct Dispatcherinvoke(obj, std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; - struct destroyer_type { + struct destroyer_type + { impl_base* m_impl; - void operator()(void_ptr_wrapper obj) const - { - m_impl->destroy(obj); - } + void operator()(void_ptr_wrapper obj) const { m_impl->destroy(obj); } }; // This can't be a cuda device lambda due to compiler limitations - template < typename T > - struct DeviceImplTypeFactory { + template + struct DeviceImplTypeFactory + { using value_type = device_impl_type*; - RAJA_DEVICE value_type operator()() { + RAJA_DEVICE value_type operator()() + { #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) return nullptr; #else @@ -333,16 +368,17 @@ struct Dispatcher* = nullptr > - static inline Dispatcher makeDispatcher() { + template * = nullptr> + static inline Dispatcher makeDispatcher() + { static base_impl_type s_base_impl; static host_impl_type s_host_impl; - return { mover_type{&s_base_impl}, - host_invoker_type{&s_host_impl}, - destroyer_type{&s_base_impl}, - sizeof(T) - }; + return {mover_type{&s_base_impl}, + host_invoker_type{&s_host_impl}, + destroyer_type{&s_base_impl}, + sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -354,17 +390,19 @@ struct Dispatcher* = nullptr> - static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) + { static base_impl_type s_base_impl; - static device_impl_type* s_device_impl_ptr{ - std::forward(createOnDevice)(DeviceImplTypeFactory{}) }; - return { mover_type{&s_base_impl}, - device_invoker_type{s_device_impl_ptr}, - destroyer_type{&s_base_impl}, - sizeof(T) - }; + static device_impl_type* s_device_impl_ptr{std::forward( + createOnDevice)(DeviceImplTypeFactory{})}; + return {mover_type{&s_base_impl}, + device_invoker_type{s_device_impl_ptr}, + destroyer_type{&s_base_impl}, + sizeof(T)}; } mover_type move_construct_destroy; @@ -375,17 +413,23 @@ struct Dispatcher -struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> { - using type = ::RAJA::direct_dispatch...>; +template +struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> +{ + using type = + ::RAJA::direct_dispatch...>; }; /*! * Version of Dispatcher that does direct dispatch to zero callable types. * It implements the interface with callable objects. */ -template < Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch<>; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -395,40 +439,41 @@ struct Dispatcher, DispatcherID, CallArgs... 
/// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { - void operator()(void_ptr_wrapper, void_ptr_wrapper) const - { } + struct mover_type + { + void operator()(void_ptr_wrapper, void_ptr_wrapper) const {} }; /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { - void operator()(void_cptr_wrapper, CallArgs...) const - { } + struct host_invoker_type + { + void operator()(void_cptr_wrapper, CallArgs...) const {} }; - struct device_invoker_type { - RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const - { } + struct device_invoker_type + { + RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const {} }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { - void operator()(void_ptr_wrapper) const - { } + struct destroyer_type + { + void operator()(void_ptr_wrapper) const {} }; /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename T, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { + template * = nullptr> + static inline Dispatcher makeDispatcher() + { return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; } /// @@ -437,9 +482,12 @@ struct Dispatcher, DispatcherID, CallArgs... /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename T, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; } @@ -453,8 +501,15 @@ struct Dispatcher, DispatcherID, CallArgs... * Version of Dispatcher that does direct dispatch to a single callable type. * It implements the interface with callable objects. */ -template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; @@ -464,12 +519,13 @@ struct Dispatcher, DispatcherID, CallArgs.. /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { + struct mover_type + { void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } }; @@ -477,28 +533,30 @@ struct Dispatcher, DispatcherID, CallArgs.. /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { + struct host_invoker_type + { void operator()(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct device_invoker_type { + struct device_invoker_type + { RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... 
args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { + struct destroyer_type + { void operator()(void_ptr_wrapper obj) const { T* obj_as_T = static_cast(obj.ptr); @@ -509,10 +567,13 @@ struct Dispatcher, DispatcherID, CallArgs.. /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename U, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { - static_assert(std::is_same::value, "U must be in direct_dispatch types"); + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + static_assert(std::is_same::value, + "U must be in direct_dispatch types"); return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; } /// @@ -521,10 +582,14 @@ struct Dispatcher, DispatcherID, CallArgs.. /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename U, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { - static_assert(std::is_same::value, "U must be in direct_dispatch types"); + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { + static_assert(std::is_same::value, + "U must be in direct_dispatch types"); return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; } @@ -538,46 +603,55 @@ struct Dispatcher, DispatcherID, CallArgs.. * Version of Dispatcher that does direct dispatch to multiple callable types. * It implements the interface with callable objects. */ -template < typename T0, typename T1, typename ... TNs, - Platform platform, typename DispatcherID, typename ... CallArgs > -struct Dispatcher, - DispatcherID, CallArgs...> { +template +struct Dispatcher, + DispatcherID, + CallArgs...> +{ static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); using dispatch_policy = ::RAJA::direct_dispatch; using void_ptr_wrapper = DispatcherVoidPtrWrapper; using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; using id_type = int; - using callable_indices = camp::make_int_seq_t; + using callable_indices = camp::make_int_seq_t; using callable_types = camp::list; /// /// move construct an object of type T in dest as a copy of a T from src and /// destroy the T obj in src /// - struct mover_type { + struct mover_type + { id_type id; void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const { - impl_helper(callable_indices{}, callable_types{}, - dest, src); + impl_helper(callable_indices{}, callable_types{}, dest, src); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_ptr_wrapper dest, void_ptr_wrapper src) const + template + void impl_helper(camp::int_seq, + camp::list, + void_ptr_wrapper dest, + void_ptr_wrapper src) const { camp::sink(((id_types == id) ? 
(impl(dest, src), 0) : 0)...); } - template < typename T > + template void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const { T* dest_as_T = static_cast(dest.ptr); T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); + new (dest_as_T) T(std::move(*src_as_T)); (*src_as_T).~T(); } }; @@ -585,79 +659,93 @@ struct Dispatcher, /// /// invoke the call operator of the object of type T in obj with args /// - struct host_invoker_type { + struct host_invoker_type + { id_type id; void operator()(void_cptr_wrapper obj, CallArgs... args) const { - impl_helper(callable_indices{}, callable_types{}, - obj, std::forward(args)...); + impl_helper(callable_indices{}, + callable_types{}, + obj, + std::forward(args)...); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_cptr_wrapper obj, CallArgs... args) const + template + void impl_helper(camp::int_seq, + camp::list, + void_cptr_wrapper obj, + CallArgs... args) const { - camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + camp::sink(((id_types == id) + ? (impl(obj, std::forward(args)...), 0) + : 0)...); } - template < typename T > + template void impl(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - struct device_invoker_type { + struct device_invoker_type + { id_type id; RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const { - impl_helper(callable_indices{}, callable_types{}, - obj, std::forward(args)...); + impl_helper(callable_indices{}, + callable_types{}, + obj, + std::forward(args)...); } private: - template < int ... id_types, typename ... Ts > - RAJA_DEVICE void impl_helper(camp::int_seq, camp::list, - void_cptr_wrapper obj, CallArgs... args) const + template + RAJA_DEVICE void impl_helper(camp::int_seq, + camp::list, + void_cptr_wrapper obj, + CallArgs... args) const { - camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + camp::sink(((id_types == id) + ? (impl(obj, std::forward(args)...), 0) + : 0)...); } - template < typename T > + template RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... args) const { const T* obj_as_T = static_cast(obj.ptr); (*obj_as_T)(std::forward(args)...); } }; - using invoker_type = std::conditional_t; + using invoker_type = std:: + conditional_t; /// /// destroy the object of type T in obj /// - struct destroyer_type { + struct destroyer_type + { id_type id; void operator()(void_ptr_wrapper obj) const { - impl_helper(callable_indices{}, callable_types{}, - obj); + impl_helper(callable_indices{}, callable_types{}, obj); } private: - template < int ... id_types, typename ... Ts > - void impl_helper(camp::int_seq, camp::list, - void_ptr_wrapper obj) const + template + void impl_helper(camp::int_seq, + camp::list, + void_ptr_wrapper obj) const { camp::sink(((id_types == id) ? (impl(obj), 0) : 0)...); } - template < typename T > + template void impl(void_ptr_wrapper obj) const { T* obj_as_T = static_cast(obj.ptr); @@ -671,12 +759,13 @@ struct Dispatcher, /// The id is just the index of T in the list of callable_types. /// If T is not in Ts return -1. /// - template < typename T, int ... id_types, typename ... 
Ts > - static constexpr id_type get_id(camp::int_seq, camp::list) + template + static constexpr id_type get_id(camp::int_seq, + camp::list) { id_type id{-1}; // quiet UB warning by sequencing assignment to id with list initialization - int unused[] {0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; + int unused[]{0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; camp::sink(unused); // quiet unused var warning return id; } @@ -684,12 +773,16 @@ struct Dispatcher, /// /// create a Dispatcher that can be used on the host for objects of type T /// - template< typename T, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher() { - static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + template * = nullptr> + static inline Dispatcher makeDispatcher() + { + static constexpr id_type id = + get_id(callable_indices{}, callable_types{}); static_assert(id != id_type(-1), "T must be in direct_dispatch types"); - return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + return { + mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; } /// /// create a Dispatcher that can be used on the device for objects of type T @@ -697,12 +790,17 @@ struct Dispatcher, /// Ignore the CreateOnDevice object as the same invoker object can be used /// on the host and device. /// - template< typename T, typename CreateOnDevice, - bool uhi = use_host_invoke, std::enable_if_t* = nullptr > - static inline Dispatcher makeDispatcher(CreateOnDevice&&) { - static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + template * = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&&) + { + static constexpr id_type id = + get_id(callable_indices{}, callable_types{}); static_assert(id != id_type(-1), "T must be in direct_dispatch types"); - return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + return { + mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; } mover_type move_construct_destroy; @@ -718,8 +816,8 @@ struct Dispatcher, // template < typename T, typename Dispatcher_T > // inline const Dispatcher_T* get_Dispatcher(work_policy const&); -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp index 9645f73050..e07b64cdb2 100644 --- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp @@ -40,18 +40,18 @@ namespace detail /*! * A body and args holder for storing loops that are being executed in foralls */ -template +template struct HoldBodyArgs_base { // NOTE: This constructor is disabled when body_in is not LoopBody // to avoid it conflicting with the copy and move constructors - template < typename body_in, - typename = typename std::enable_if< - std::is_same>::value>::type > + template >::value>::type> HoldBodyArgs_base(body_in&& body, Args... args) - : m_body(std::forward(body)) - , m_arg_tuple(std::forward(args)...) - { } + : m_body(std::forward(body)), + m_arg_tuple(std::forward(args)...) 
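The Dispatcher.hpp hunks above reformat three specializations that erase the user's callable type in different ways: per-type static function pointers (indirect_function_call_dispatch), a per-type vtable behind abstract bases (indirect_virtual_function_dispatch), and a compile-time list of allowed types selected by an integer id (direct_dispatch). A self-contained sketch of the three strategies, not RAJA code, for a single void(int) call signature:

    // 1) indirect function call: one static function per erased callable T,
    //    stored as a plain function pointer.
    template <typename T>
    void invoke_fnptr(const void* obj, int i)
    {
      (*static_cast<const T*>(obj))(i);
    }
    using invoker_fnptr = void (*)(const void*, int);

    // 2) indirect virtual function: a per-type override reached through a vtable.
    struct invoker_base
    {
      virtual ~invoker_base() = default;
      virtual void invoke(const void* obj, int i) const = 0;
    };
    template <typename T>
    struct invoker_impl : invoker_base
    {
      void invoke(const void* obj, int i) const override
      {
        (*static_cast<const T*>(obj))(i);
      }
    };

    // 3) direct dispatch: the set of callable types is fixed at compile time
    //    and a stored id selects among them, so the call is a branch rather
    //    than an indirect jump.
    template <typename T0, typename T1>
    struct invoker_direct
    {
      int id;  // 0 selects T0, 1 selects T1
      void operator()(const void* obj, int i) const
      {
        if (id == 0) (*static_cast<const T0*>(obj))(i);
        else         (*static_cast<const T1*>(obj))(i);
      }
    };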
+ {} protected: LoopBody m_body; @@ -62,7 +62,7 @@ struct HoldBodyArgs_base * A body and args holder for storing loops that are being executed in foralls * that run on the host */ -template +template struct HoldBodyArgs_host : HoldBodyArgs_base { using base = HoldBodyArgs_base; @@ -73,7 +73,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base invoke(i, camp::make_idx_seq_t{}); } - template < camp::idx_t ... Is > + template RAJA_INLINE void invoke(index_type i, camp::idx_seq) const { this->m_body(i, get(this->m_arg_tuple)...); @@ -84,7 +84,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base * A body and args holder for storing loops that are being executed in foralls * that run on the device */ -template +template struct HoldBodyArgs_device : HoldBodyArgs_base { using base = HoldBodyArgs_base; @@ -95,7 +95,7 @@ struct HoldBodyArgs_device : HoldBodyArgs_base invoke(i, camp::make_idx_seq_t{}); } - template < camp::idx_t ... Is > + template RAJA_DEVICE RAJA_INLINE void invoke(index_type i, camp::idx_seq) const { this->m_body(i, get(this->m_arg_tuple)...); @@ -105,21 +105,24 @@ struct HoldBodyArgs_device : HoldBodyArgs_base /*! * A body and segment holder for storing loops that will be executed as foralls */ -template +template struct HoldForall { using resource_type = typename resources::get_resource::type; using HoldBodyArgs = typename std::conditional< !type_traits::is_device_exec_policy::value, HoldBodyArgs_host, - HoldBodyArgs_device >::type; + HoldBodyArgs_device>::type; - template < typename segment_in, typename body_in > + template HoldForall(segment_in&& segment, body_in&& body) - : m_segment(std::forward(segment)) - , m_body(std::forward(body)) - { } + : m_segment(std::forward(segment)), + m_body(std::forward(body)) + {} RAJA_INLINE void operator()(resource_type r, Args... args) const { @@ -143,7 +146,7 @@ template + typename... Args> struct WorkRunner; @@ -156,7 +159,7 @@ template + typename... Args> struct WorkRunnerForallOrdered_base { using exec_policy = EXEC_POLICY_T; @@ -164,20 +167,24 @@ struct WorkRunnerForallOrdered_base using dispatch_policy = DISPATCH_POLICY_T; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; - using resource_type = typename resources::get_resource::type; + using resource_type = + typename resources::get_resource::type; using forall_exec_policy = FORALL_EXEC_POLICY; // The type that will hold the segment and loop body in work storage - struct holder_type { - template < typename T > - using type = HoldForall>::type, // segment_type - typename camp::at>::type, // loop_type - index_type, Args...>; + struct holder_type + { + template + using type = + HoldForall>::type, // segment_type + typename camp::at>::type, // loop_type + index_type, + Args...>; }; /// - template < typename T > + template using holder_type_t = typename holder_type::template type; // The policy indicating where the call function is invoked @@ -186,33 +193,41 @@ struct WorkRunnerForallOrdered_base // The Dispatcher policy with holder_types used internally to handle the // ranges and callables passed in by the user. 
- using dispatcher_holder_policy = dispatcher_transform_types_t; + using dispatcher_holder_policy = + dispatcher_transform_types_t; - using dispatcher_type = Dispatcher; + using dispatcher_type = Dispatcher; WorkRunnerForallOrdered_base() = default; WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base const&) = delete; - WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) = delete; + WorkRunnerForallOrdered_base& + operator=(WorkRunnerForallOrdered_base const&) = delete; - WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default; - WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default; + WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base&&) = default; + WorkRunnerForallOrdered_base& + operator=(WorkRunnerForallOrdered_base&&) = default; // runner interfaces with storage to enqueue so the runner can get // information from the segment and loop at enqueue time - template < typename WorkContainer, typename segment_T, typename loop_T > + template inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop) { - using holder = holder_type_t, camp::decay>>; + using holder = + holder_type_t, camp::decay>>; storage.template emplace( get_Dispatcher(dispatcher_exec_policy{}), - std::forward(seg), std::forward(loop)); + std::forward(seg), + std::forward(loop)); } // clear any state so ready to be destroyed or reused - void clear() - { } + void clear() {} // no extra storage required here using per_run_storage = int; @@ -227,29 +242,27 @@ template + typename... Args> struct WorkRunnerForallOrdered - : WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...> + : WorkRunnerForallOrdered_base { - using base = WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...>; + using base = WorkRunnerForallOrdered_base; using base::base; // run the loops using forall in the order that they were enqueued - template < typename WorkContainer > + template typename base::per_run_storage run(WorkContainer const& storage, typename base::resource_type r, Args... args) const @@ -259,7 +272,8 @@ struct WorkRunnerForallOrdered typename base::per_run_storage run_storage{}; auto end = storage.end(); - for (auto iter = storage.begin(); iter != end; ++iter) { + for (auto iter = storage.begin(); iter != end; ++iter) + { value_type::host_call(&*iter, r, args...); } @@ -276,29 +290,28 @@ template + typename... Args> struct WorkRunnerForallReverse - : WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...> + : WorkRunnerForallOrdered_base { - using base = WorkRunnerForallOrdered_base< - FORALL_EXEC_POLICY, - EXEC_POLICY_T, - ORDER_POLICY_T, - DISPATCH_POLICY_T, - ALLOCATOR_T, - INDEX_T, - Args...>; + using base = WorkRunnerForallOrdered_base; using base::base; - // run the loops using forall in the reverse order to the order they were enqueued - template < typename WorkContainer > + // run the loops using forall in the reverse order to the order they were + // enqueued + template typename base::per_run_storage run(WorkContainer const& storage, typename base::resource_type r, Args... 
args) const @@ -308,16 +321,17 @@ struct WorkRunnerForallReverse typename base::per_run_storage run_storage{}; auto begin = storage.begin(); - for (auto iter = storage.end(); iter != begin; --iter) { - value_type::host_call(&*(iter-1), r, args...); + for (auto iter = storage.end(); iter != begin; --iter) + { + value_type::host_call(&*(iter - 1), r, args...); } return run_storage; } }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp index 52631d108f..8a43982bd3 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp @@ -46,7 +46,7 @@ namespace detail // operator - ( iterator_base const& ) // operator == ( iterator_base const& ) // operator < ( iterator_base const& ) -template < typename iterator_base > +template struct random_access_iterator : iterator_base { using base = iterator_base; @@ -59,10 +59,10 @@ struct random_access_iterator : iterator_base using base::base; random_access_iterator(random_access_iterator const&) = default; - random_access_iterator(random_access_iterator &&) = default; + random_access_iterator(random_access_iterator&&) = default; random_access_iterator& operator=(random_access_iterator const&) = default; - random_access_iterator& operator=(random_access_iterator &&) = default; + random_access_iterator& operator=(random_access_iterator&&) = default; RAJA_HOST_DEVICE reference operator*() const @@ -70,10 +70,7 @@ struct random_access_iterator : iterator_base return *static_cast(*this); } - RAJA_HOST_DEVICE pointer operator->() const - { - return &(*(*this)); - } + RAJA_HOST_DEVICE pointer operator->() const { return &(*(*this)); } RAJA_HOST_DEVICE reference operator[](difference_type i) const { @@ -120,68 +117,75 @@ struct random_access_iterator : iterator_base return *this; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator+( - random_access_iterator const& lhs, difference_type rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator+(random_access_iterator const& lhs, difference_type rhs) { random_access_iterator copy = lhs; copy += rhs; return copy; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator+( - difference_type lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator+(difference_type lhs, random_access_iterator const& rhs) { random_access_iterator copy = rhs; copy += lhs; return copy; } - RAJA_HOST_DEVICE friend inline random_access_iterator operator-( - random_access_iterator const& lhs, difference_type rhs) + RAJA_HOST_DEVICE friend inline random_access_iterator + operator-(random_access_iterator const& lhs, difference_type rhs) { random_access_iterator copy = lhs; copy -= rhs; return copy; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline difference_type + operator-(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return static_cast(lhs) - static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator==( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator==(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return 
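The WorkRunner.hpp hunks above end with the two host runners: WorkRunnerForallOrdered replays the stored WorkStructs front to back, while WorkRunnerForallReverse replays them back to front, invoking host_call on each entry. A self-contained sketch of that replay order (not RAJA code; std::function stands in for the stored work items):

    #include <functional>
    #include <vector>

    // run enqueued work in the order it was added
    void run_ordered(const std::vector<std::function<void()>>& work)
    {
      for (auto it = work.begin(); it != work.end(); ++it)
      {
        (*it)();
      }
    }

    // run enqueued work in reverse order, mirroring the (iter - 1) loop above
    void run_reverse(const std::vector<std::function<void()>>& work)
    {
      for (auto it = work.end(); it != work.begin(); --it)
      {
        (*(it - 1))();
      }
    }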
static_cast(lhs) == static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator!=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator!=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(lhs == rhs); } - RAJA_HOST_DEVICE friend inline bool operator<( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator<(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return static_cast(lhs) < static_cast(rhs); } - RAJA_HOST_DEVICE friend inline bool operator<=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator<=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(rhs < lhs); } - RAJA_HOST_DEVICE friend inline bool operator>( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator>(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return rhs < lhs; } - RAJA_HOST_DEVICE friend inline bool operator>=( - random_access_iterator const& lhs, random_access_iterator const& rhs) + RAJA_HOST_DEVICE friend inline bool + operator>=(random_access_iterator const& lhs, + random_access_iterator const& rhs) { return !(lhs < rhs); } @@ -191,10 +195,12 @@ struct random_access_iterator : iterator_base /*! * A storage container for work groups */ -template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage { using allocator_traits_type = std::allocator_traits; @@ -202,15 +208,17 @@ class WorkStorage typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; - static_assert(std::is_same::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::array_of_pointers; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -231,8 +239,8 @@ class WorkStorage }; public: - - // iterator base class for accessing stored WorkStructs outside of the container + // iterator base class for accessing stored WorkStructs outside of the + // container struct const_iterator_base { using value_type = const typename WorkStorage::value_type; @@ -241,14 +249,9 @@ class WorkStorage using difference_type = typename WorkStorage::difference_type; using iterator_category = std::random_access_iterator_tag; - const_iterator_base(const pointer_and_size* ptrptr) - : m_ptrptr(ptrptr) - { } + const_iterator_base(const pointer_and_size* ptrptr) : m_ptrptr(ptrptr) {} - RAJA_HOST_DEVICE reference operator*() const - { - return *(m_ptrptr->ptr); - } + RAJA_HOST_DEVICE reference operator*() const { return *(m_ptrptr->ptr); } RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n) { @@ -256,20 +259,23 @@ class WorkStorage return *this; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE 
friend inline difference_type + operator-(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr - rhs_iter.m_ptrptr; } - RAJA_HOST_DEVICE friend inline bool operator==( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator==(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr == rhs_iter.m_ptrptr; } - RAJA_HOST_DEVICE friend inline bool operator<( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator<(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_ptrptr < rhs_iter.m_ptrptr; } @@ -282,22 +288,22 @@ class WorkStorage explicit WorkStorage(allocator_type const& aloc) - : m_vec(0, aloc) - , m_aloc(aloc) - { } + : m_vec(0, aloc), m_aloc(aloc) + {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_vec(std::move(rhs.m_vec)) - , m_aloc(std::move(rhs.m_aloc)) - { } + : m_vec(std::move(rhs.m_vec)), m_aloc(std::move(rhs.m_aloc)) + {} WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -312,33 +318,26 @@ class WorkStorage } // number of loops stored - size_type size() const - { - return m_vec.size(); - } + size_type size() const { return m_vec.size(); } - const_iterator begin() const - { - return const_iterator(m_vec.begin()); - } + const_iterator begin() const { return const_iterator(m_vec.begin()); } - const_iterator end() const - { - return const_iterator(m_vec.end()); - } + const_iterator end() const { return const_iterator(m_vec.end()); } // number of bytes used for storage of loops size_type storage_size() const { size_type storage_size_nbytes = 0; - for (size_t i = 0; i < m_vec.size(); ++i) { + for (size_t i = 0; i < m_vec.size(); ++i) + { storage_size_nbytes += m_vec[i].size; } return storage_size_nbytes; } - template < typename holder, typename ... holder_ctor_args > - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... 
ctor_args) { m_vec.emplace_back(create_value( dispatcher, std::forward(ctor_args)...)); @@ -347,20 +346,21 @@ class WorkStorage // destroy all stored loops, deallocates all storage void clear() { - while (!m_vec.empty()) { + while (!m_vec.empty()) + { destroy_value(m_vec.back()); m_vec.pop_back(); } m_vec.shrink_to_fit(); } - ~WorkStorage() - { - clear(); - } + ~WorkStorage() { clear(); } private: - RAJAVec> m_vec; + RAJAVec< + pointer_and_size, + typename allocator_traits_type::template rebind_alloc> + m_vec; allocator_type m_aloc; // move assignment if allocator propagates on move assignment @@ -375,12 +375,16 @@ class WorkStorage void move_assign_private(WorkStorage&& rhs, std::false_type) { clear(); - if (m_aloc == rhs.m_aloc) { + if (m_aloc == rhs.m_aloc) + { // take storage if allocators compare equal m_vec = std::move(rhs.m_vec); - } else { + } + else + { // allocate new storage if allocators do not compare equal - for (size_type i = 0; i < rhs.m_vec.size(); ++i) { + for (size_type i = 0; i < rhs.m_vec.size(); ++i) + { m_vec.emplace_back(move_destroy_value(std::move(rhs), rhs.m_vec[i])); } rhs.m_vec.clear(); @@ -389,7 +393,7 @@ class WorkStorage } // allocate and construct value in storage - template < typename holder, typename ... holder_ctor_args > + template pointer_and_size create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { @@ -414,8 +418,10 @@ class WorkStorage value_type::move_destroy(value_ptr, other_value_and_size.ptr); - allocator_traits_type::deallocate(rhs.m_aloc, - reinterpret_cast(other_value_and_size.ptr), other_value_and_size.size); + allocator_traits_type::deallocate( + rhs.m_aloc, + reinterpret_cast(other_value_and_size.ptr), + other_value_and_size.size); return pointer_and_size{value_ptr, other_value_and_size.size}; } @@ -424,12 +430,14 @@ class WorkStorage void destroy_value(pointer_and_size value_and_size_ptr) { value_type::destroy(value_and_size_ptr.ptr); - allocator_traits_type::deallocate(m_aloc, - reinterpret_cast(value_and_size_ptr.ptr), value_and_size_ptr.size); + allocator_traits_type::deallocate( + m_aloc, + reinterpret_cast(value_and_size_ptr.ptr), + value_and_size_ptr.size); } }; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage { using allocator_traits_type = std::allocator_traits; @@ -437,15 +445,17 @@ class WorkStorage typename allocator_traits_type::propagate_on_container_copy_assignment; using propagate_on_container_move_assignment = typename allocator_traits_type::propagate_on_container_move_assignment; - using propagate_on_container_swap = + using propagate_on_container_swap = typename allocator_traits_type::propagate_on_container_swap; - static_assert(std::is_same::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::ragged_array_of_objects; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -457,7 +467,8 @@ class WorkStorage using pointer = value_type*; using const_pointer = const value_type*; - // iterator base class for accessing stored WorkStructs outside of the container + // iterator base class for accessing stored WorkStructs outside of the + // container struct const_iterator_base { using value_type = const typename WorkStorage::value_type; @@ -467,14 +478,12 @@ class WorkStorage using iterator_category = std::random_access_iterator_tag; const_iterator_base(const char* array_begin, 
const size_type* offset_iter) - : m_array_begin(array_begin) - , m_offset_iter(offset_iter) - { } + : m_array_begin(array_begin), m_offset_iter(offset_iter) + {} RAJA_HOST_DEVICE reference operator*() const { - return *reinterpret_cast( - m_array_begin + *m_offset_iter); + return *reinterpret_cast(m_array_begin + *m_offset_iter); } RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n) @@ -483,20 +492,23 @@ class WorkStorage return *this; } - RAJA_HOST_DEVICE friend inline difference_type operator-( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline difference_type + operator-(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter - rhs_iter.m_offset_iter; } - RAJA_HOST_DEVICE friend inline bool operator==( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator==(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter == rhs_iter.m_offset_iter; } - RAJA_HOST_DEVICE friend inline bool operator<( - const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter) + RAJA_HOST_DEVICE friend inline bool + operator<(const_iterator_base const& lhs_iter, + const_iterator_base const& rhs_iter) { return lhs_iter.m_offset_iter < rhs_iter.m_offset_iter; } @@ -510,19 +522,18 @@ class WorkStorage explicit WorkStorage(allocator_type const& aloc) - : m_offsets(0, aloc) - , m_aloc(aloc) - { } + : m_offsets(0, aloc), m_aloc(aloc) + {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_offsets(std::move(rhs.m_offsets)) - , m_array_begin(rhs.m_array_begin) - , m_array_end(rhs.m_array_end) - , m_array_cap(rhs.m_array_cap) - , m_aloc(std::move(rhs.m_aloc)) + : m_offsets(std::move(rhs.m_offsets)), + m_array_begin(rhs.m_array_begin), + m_array_end(rhs.m_array_end), + m_array_cap(rhs.m_array_cap), + m_aloc(std::move(rhs.m_aloc)) { rhs.m_array_begin = nullptr; rhs.m_array_end = nullptr; @@ -531,8 +542,10 @@ class WorkStorage WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -546,10 +559,7 @@ class WorkStorage } // number of loops stored - size_type size() const - { - return m_offsets.size(); - } + size_type size() const { return m_offsets.size(); } const_iterator begin() const { @@ -562,17 +572,15 @@ class WorkStorage } // number of bytes used for storage of loops - size_type storage_size() const - { - return m_array_end - m_array_begin; - } + size_type storage_size() const { return m_array_end - m_array_begin; } - template < typename holder, typename ... holder_ctor_args > - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... 
ctor_args) { size_type value_offset = storage_size(); - size_type value_size = create_value(value_offset, - dispatcher, std::forward(ctor_args)...); + size_type value_size = create_value( + value_offset, dispatcher, std::forward(ctor_args)...); m_offsets.emplace_back(value_offset); m_array_end += value_size; } @@ -581,24 +589,25 @@ class WorkStorage void clear() { array_clear(); - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); m_array_begin = nullptr; - m_array_end = nullptr; - m_array_cap = nullptr; + m_array_end = nullptr; + m_array_cap = nullptr; } } - ~WorkStorage() - { - clear(); - } + ~WorkStorage() { clear(); } private: - RAJAVec> m_offsets; + RAJAVec> + m_offsets; char* m_array_begin = nullptr; - char* m_array_end = nullptr; - char* m_array_cap = nullptr; + char* m_array_end = nullptr; + char* m_array_cap = nullptr; allocator_type m_aloc; // move assignment if allocator propagates on move assignment @@ -606,35 +615,39 @@ class WorkStorage { clear(); - m_offsets = std::move(rhs.m_offsets); + m_offsets = std::move(rhs.m_offsets); m_array_begin = rhs.m_array_begin; - m_array_end = rhs.m_array_end ; - m_array_cap = rhs.m_array_cap ; - m_aloc = std::move(rhs.m_aloc); + m_array_end = rhs.m_array_end; + m_array_cap = rhs.m_array_cap; + m_aloc = std::move(rhs.m_aloc); rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; } // move assignment if allocator does not propagate on move assignment void move_assign_private(WorkStorage&& rhs, std::false_type) { clear(); - if (m_aloc == rhs.m_aloc) { + if (m_aloc == rhs.m_aloc) + { - m_offsets = std::move(rhs.m_offsets); + m_offsets = std::move(rhs.m_offsets); m_array_begin = rhs.m_array_begin; - m_array_end = rhs.m_array_end ; - m_array_cap = rhs.m_array_cap ; + m_array_end = rhs.m_array_end; + m_array_cap = rhs.m_array_cap; rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; - } else { + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; + } + else + { array_reserve(rhs.storage_size()); - for (size_type i = 0; i < rhs.size(); ++i) { + for (size_type i = 0; i < rhs.size(); ++i) + { m_array_end = m_array_begin + rhs.m_offsets[i]; move_destroy_value(m_array_end, rhs.m_array_begin + rhs.m_offsets[i]); m_offsets.emplace_back(rhs.m_offsets[i]); @@ -647,46 +660,45 @@ class WorkStorage } // get loop storage capacity, used and unused in bytes - size_type storage_capacity() const - { - return m_array_cap - m_array_begin; - } + size_type storage_capacity() const { return m_array_cap - m_array_begin; } // get unused loop storage capacity in bytes - size_type storage_unused() const - { - return m_array_cap - m_array_end; - } + size_type storage_unused() const { return m_array_cap - m_array_end; } // reserve space for loop_storage_size bytes of loop storage void array_reserve(size_type loop_storage_size) { - if (loop_storage_size > storage_capacity()) { + if (loop_storage_size > storage_capacity()) + { char* new_array_begin = allocator_traits_type::allocate(m_aloc, loop_storage_size); - char* new_array_end = new_array_begin + storage_size(); - char* new_array_cap = new_array_begin + loop_storage_size; + char* new_array_end = new_array_begin + storage_size(); + char* new_array_cap = new_array_begin + loop_storage_size; - for (size_type i = 0; i < 
size(); ++i) { + for (size_type i = 0; i < size(); ++i) + { move_destroy_value(new_array_begin + m_offsets[i], - m_array_begin + m_offsets[i]); + m_array_begin + m_offsets[i]); } - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); } m_array_begin = new_array_begin; - m_array_end = new_array_end ; - m_array_cap = new_array_cap ; + m_array_end = new_array_end; + m_array_cap = new_array_cap; } } // destroy loop objects (does not deallocate array storage) void array_clear() { - while (!m_offsets.empty()) { + while (!m_offsets.empty()) + { destroy_value(m_offsets.back()); m_array_end = m_array_begin + m_offsets.back(); m_offsets.pop_back(); @@ -696,15 +708,17 @@ class WorkStorage // ensure there is enough storage to hold the next loop body at value offset // and store the loop body - template < typename holder, typename ... holder_ctor_args > + template size_type create_value(size_type value_offset, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); - if (value_size > storage_unused()) { - array_reserve(std::max(storage_size() + value_size, 2*storage_capacity())); + if (value_size > storage_unused()) + { + array_reserve( + std::max(storage_size() + value_size, 2 * storage_capacity())); } pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); @@ -726,13 +740,12 @@ class WorkStorage // destroy the loop body at value offset void destroy_value(size_type value_offset) { - pointer value_ptr = - reinterpret_cast(m_array_begin + value_offset); + pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::destroy(value_ptr); } }; -template < typename ALLOCATOR_T, typename Dispatcher_T > +template class WorkStorage @@ -742,15 +755,17 @@ class WorkStorage::value, + static_assert( + std::is_same::value, "WorkStorage expects an allocator for 'char's."); + public: using storage_policy = RAJA::constant_stride_array_of_objects; using dispatcher_type = Dispatcher_T; - template < typename holder > + template using true_value_type = WorkStruct; using value_type = GenericWorkStruct; @@ -762,7 +777,8 @@ class WorkStorage; - explicit WorkStorage(allocator_type const& aloc) - : m_aloc(aloc) - { } + explicit WorkStorage(allocator_type const& aloc) : m_aloc(aloc) {} WorkStorage(WorkStorage const&) = delete; WorkStorage& operator=(WorkStorage const&) = delete; WorkStorage(WorkStorage&& rhs) - : m_aloc(std::move(rhs.m_aloc)) - , m_stride(rhs.m_stride) - , m_array_begin(rhs.m_array_begin) - , m_array_end(rhs.m_array_end) - , m_array_cap(rhs.m_array_cap) + : m_aloc(std::move(rhs.m_aloc)), + m_stride(rhs.m_stride), + m_array_begin(rhs.m_array_begin), + m_array_end(rhs.m_array_end), + m_array_cap(rhs.m_array_cap) { // do not reset stride, leave it for reuse rhs.m_array_begin = nullptr; - rhs.m_array_end = nullptr; - rhs.m_array_cap = nullptr; + rhs.m_array_end = nullptr; + rhs.m_array_cap = nullptr; } WorkStorage& operator=(WorkStorage&& rhs) { - if (this != &rhs) { - move_assign_private(std::move(rhs), propagate_on_container_move_assignment{}); + if (this != &rhs) + { + move_assign_private(std::move(rhs), + propagate_on_container_move_assignment{}); } return *this; } @@ -847,35 +865,28 @@ class WorkStorage - void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... 
ctor_args) + template + void emplace(const dispatcher_type* dispatcher, + holder_ctor_args&&... ctor_args) { - create_value(dispatcher, std::forward(ctor_args)...); + create_value(dispatcher, + std::forward(ctor_args)...); m_array_end += m_stride; } @@ -883,64 +894,67 @@ class WorkStorage storage_capacity() || new_stride > m_stride) { + if (loop_storage_size > storage_capacity() || new_stride > m_stride) + { char* new_array_begin = allocator_traits_type::allocate(m_aloc, loop_storage_size); - char* new_array_end = new_array_begin + size() * new_stride; - char* new_array_cap = new_array_begin + loop_storage_size; + char* new_array_end = new_array_begin + size() * new_stride; + char* new_array_cap = new_array_begin + loop_storage_size; - for (size_type i = 0; i < size(); ++i) { + for (size_type i = 0; i < size(); ++i) + { move_destroy_value(new_array_begin + i * new_stride, - m_array_begin + i * m_stride); + m_array_begin + i * m_stride); } - if (m_array_begin != nullptr) { - allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity()); + if (m_array_begin != nullptr) + { + allocator_traits_type::deallocate( + m_aloc, m_array_begin, storage_capacity()); } - m_stride = new_stride ; + m_stride = new_stride; m_array_begin = new_array_begin; - m_array_end = new_array_end ; - m_array_cap = new_array_cap ; + m_array_end = new_array_end; + m_array_cap = new_array_cap; } } // destroy the loops in storage (does not deallocate loop storage) void array_clear() { - for (size_type value_offset = storage_size(); value_offset > 0; value_offset -= m_stride) { + for (size_type value_offset = storage_size(); value_offset > 0; + value_offset -= m_stride) + { destroy_value(value_offset - m_stride); m_array_end -= m_stride; } @@ -1002,18 +1016,20 @@ class WorkStorage + template void create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); - if (value_size > storage_unused() && value_size <= m_stride) { - array_reserve(std::max(storage_size() + m_stride, 2*storage_capacity()), + if (value_size > storage_unused() && value_size <= m_stride) + { + array_reserve(std::max(storage_size() + m_stride, 2 * storage_capacity()), m_stride); - } else if (value_size > m_stride) { - array_reserve((size()+1)*value_size, - value_size); + } + else if (value_size > m_stride) + { + array_reserve((size() + 1) * value_size, value_size); } size_type value_offset = storage_size(); @@ -1025,8 +1041,7 @@ class WorkStorage(value_ptr), reinterpret_cast(other_value_ptr)); @@ -1035,14 +1050,13 @@ class WorkStorage(m_array_begin + value_offset); + pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::destroy(value_ptr); } }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp index 72e1540c54..4ccfc5d4f5 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp @@ -35,7 +35,7 @@ namespace detail /*! * A struct that gives a generic way to layout memory for different loops */ -template < size_t size, typename Dispatcher_T > +template struct WorkStruct; /*! 
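The three WorkStorage specializations reformatted above differ only in layout: array_of_pointers allocates each WorkStruct separately and keeps a vector of pointer/size pairs, ragged_array_of_objects packs the structs contiguously and records a per-item byte offset, and constant_stride_array_of_objects packs them at a single fixed stride large enough for the biggest struct. The storage policy is selected as the third WorkGroupPolicy parameter; the aliases below are hypothetical examples of that choice (assuming a sequential work policy and the default dispatch policy):

    #include "RAJA/RAJA.hpp"

    using ptrs_pol    = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::array_of_pointers>;
    using ragged_pol  = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::ragged_array_of_objects>;
    using strided_pol = RAJA::WorkGroupPolicy<RAJA::seq_work, RAJA::ordered,
                                              RAJA::constant_stride_array_of_objects>;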
@@ -44,67 +44,75 @@ struct WorkStruct; * offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct, obj) * sizeof(GenericWorkStruct) <= sizeof(WorkStruct) */ -template < typename Dispatcher_T > +template using GenericWorkStruct = WorkStruct; -template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > -struct WorkStruct> +template +struct WorkStruct< + size, + Dispatcher> { - using dispatcher_type = Dispatcher; + using dispatcher_type = + Dispatcher; // construct a WorkStruct with a value of type holder from the args and // check a variety of constraints at compile time - template < typename holder, typename ... holder_ctor_args > - static RAJA_INLINE - void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) + template + static RAJA_INLINE void construct(void* ptr, + const dispatcher_type* dispatcher, + holder_ctor_args&&... ctor_args) { using true_value_type = WorkStruct; using value_type = GenericWorkStruct; static_assert(sizeof(holder) <= sizeof(true_value_type::obj), - "holder must fit in WorkStruct::obj"); + "holder must fit in WorkStruct::obj"); static_assert(std::is_standard_layout::value, - "WorkStruct must be a standard layout type"); + "WorkStruct must be a standard layout type"); static_assert(std::is_standard_layout::value, - "GenericWorkStruct must be a standard layout type"); + "GenericWorkStruct must be a standard layout type"); static_assert(offsetof(value_type, obj) == offsetof(true_value_type, obj), - "WorkStruct and GenericWorkStruct must have obj at the same offset"); + "WorkStruct and GenericWorkStruct must have obj at the same " + "offset"); static_assert(sizeof(value_type) <= sizeof(true_value_type), - "WorkStruct must not be smaller than GenericWorkStruct"); + "WorkStruct must not be smaller than GenericWorkStruct"); true_value_type* value_ptr = static_cast(ptr); value_ptr->dispatcher = dispatcher; value_ptr->invoke = dispatcher->invoke; - new(&value_ptr->obj) holder(std::forward(ctor_args)...); + new (&value_ptr->obj) holder(std::forward(ctor_args)...); } // move construct in dst from the value in src and destroy the value in src - static RAJA_INLINE - void move_destroy(WorkStruct* value_dst, - WorkStruct* value_src) + static RAJA_INLINE void move_destroy(WorkStruct* value_dst, + WorkStruct* value_src) { value_dst->dispatcher = value_src->dispatcher; value_dst->invoke = value_src->invoke; - value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj); + value_dst->dispatcher->move_construct_destroy(&value_dst->obj, + &value_src->obj); } // destroy the value ptr - static RAJA_INLINE - void destroy(WorkStruct* value_ptr) + static RAJA_INLINE void destroy(WorkStruct* value_ptr) { value_ptr->dispatcher->destroy(&value_ptr->obj); } // invoke the call operator of the value ptr with args - static RAJA_INLINE - void host_call(const WorkStruct* value_ptr, CallArgs... args) + static RAJA_INLINE void host_call(const WorkStruct* value_ptr, + CallArgs... args) { value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } /// // invoke the call operator of the value ptr with args - static RAJA_DEVICE RAJA_INLINE - void device_call(const WorkStruct* value_ptr, CallArgs... args) + static RAJA_DEVICE RAJA_INLINE void device_call(const WorkStruct* value_ptr, + CallArgs... 
args) { value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } @@ -114,8 +122,8 @@ struct WorkStruct::type obj; }; -} // namespace detail +} // namespace detail -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp index d5905f7928..2a024c6db3 100644 --- a/include/RAJA/pattern/atomic.hpp +++ b/include/RAJA/pattern/atomic.hpp @@ -87,7 +87,7 @@ namespace RAJA */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T* acc) { return RAJA::atomicLoad(Policy{}, acc); } @@ -100,7 +100,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T* acc, T value) { RAJA::atomicStore(Policy{}, acc, value); } @@ -114,7 +114,7 @@ RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T* acc, T value) { return RAJA::atomicAdd(Policy{}, acc, value); } @@ -128,7 +128,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T* acc, T value) { return RAJA::atomicSub(Policy{}, acc, value); } @@ -142,7 +142,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T* acc, T value) { return RAJA::atomicMin(Policy{}, acc, value); } @@ -156,7 +156,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T* acc, T value) { return RAJA::atomicMax(Policy{}, acc, value); } @@ -169,7 +169,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc) { return RAJA::atomicInc(Policy{}, acc); } @@ -185,7 +185,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc, T compare) { return RAJA::atomicInc(Policy{}, acc, compare); } @@ -198,7 +198,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc) { return RAJA::atomicDec(Policy{}, acc); } @@ -214,7 +214,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc, T compare) { return RAJA::atomicDec(Policy{}, acc, compare); } @@ -229,7 +229,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T* acc, T value) { static_assert(std::is_integral::value, "atomicAnd can only be used on integral types"); @@ -246,7 +246,7 @@ RAJA_INLINE 
RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T* acc, T value) { static_assert(std::is_integral::value, "atomicOr can only be used on integral types"); @@ -263,7 +263,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T* acc, T value) { static_assert(std::is_integral::value, "atomicXor can only be used on integral types"); @@ -279,7 +279,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value) { return RAJA::atomicExchange(Policy{}, acc, value); } @@ -295,7 +295,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value) { return RAJA::atomicCAS(Policy{}, acc, compare, value); } @@ -317,22 +317,18 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE - constexpr explicit AtomicRef(value_type *value_ptr) - : m_value_ptr(value_ptr) {} + constexpr explicit AtomicRef(value_type* value_ptr) : m_value_ptr(value_ptr) + {} RAJA_INLINE RAJA_HOST_DEVICE - constexpr AtomicRef(AtomicRef const &c) - : m_value_ptr(c.m_value_ptr) {} + constexpr AtomicRef(AtomicRef const& c) : m_value_ptr(c.m_value_ptr) {} AtomicRef& operator=(AtomicRef const&) = delete; RAJA_INLINE RAJA_HOST_DEVICE - value_type * getPointer() const - { - return m_value_ptr; - } + value_type* getPointer() const { return m_value_ptr; } RAJA_INLINE RAJA_HOST_DEVICE @@ -351,17 +347,11 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE - value_type load() const - { - return RAJA::atomicLoad(m_value_ptr); - } + value_type load() const { return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE - operator value_type() const - { - return RAJA::atomicLoad(m_value_ptr); - } + operator value_type() const { return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE @@ -383,9 +373,12 @@ class AtomicRef { value_type compare = expect; value_type old = RAJA::atomicCAS(m_value_ptr, compare, rhs); - if (compare == old) { + if (compare == old) + { return true; - } else { + } + else + { expect = old; return false; } @@ -527,10 +520,10 @@ class AtomicRef } private: - value_type *m_value_ptr; + value_type* m_value_ptr; }; -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/pattern/detail/algorithm.hpp b/include/RAJA/pattern/detail/algorithm.hpp index 21d266bd21..23ed6c462e 100644 --- a/include/RAJA/pattern/detail/algorithm.hpp +++ b/include/RAJA/pattern/detail/algorithm.hpp @@ -49,30 +49,29 @@ using ContainerVal = camp::decay>())>; template -using ContainerRef = - decltype(*camp::val>()); +using ContainerRef = decltype(*camp::val>()); template using ContainerDiff = - camp::decay>()-camp::val>())>; + camp::decay>() - + camp::val>())>; template -RAJA_INLINE -DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id) +RAJA_INLINE DiffType firstIndex(DiffType n, + CountType num_threads, + CountType thread_id) { return (static_cast(n) * thread_id) / num_threads; } -} // end namespace detail +} // end namespace detail /*! 
\brief swap values at iterators lhs and rhs */ template -RAJA_HOST_DEVICE RAJA_INLINE -void -safe_iter_swap(Iter lhs, Iter rhs) +RAJA_HOST_DEVICE RAJA_INLINE void safe_iter_swap(Iter lhs, Iter rhs) { #ifdef RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE using camp::safe_swap; @@ -87,9 +86,7 @@ safe_iter_swap(Iter lhs, Iter rhs) \brief returns iterator to next item */ template -RAJA_HOST_DEVICE RAJA_INLINE -Iter -next(Iter it) +RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it) { ++it; return it; @@ -99,14 +96,12 @@ next(Iter it) \brief returns iterator to next item */ template -RAJA_HOST_DEVICE RAJA_INLINE -Iter -prev(Iter it) +RAJA_HOST_DEVICE RAJA_INLINE Iter prev(Iter it) { --it; return it; } -} // end namespace RAJA +} // end namespace RAJA #endif diff --git a/include/RAJA/pattern/detail/forall.hpp b/include/RAJA/pattern/detail/forall.hpp index 3bd5d7ecaf..217ef0b882 100644 --- a/include/RAJA/pattern/detail/forall.hpp +++ b/include/RAJA/pattern/detail/forall.hpp @@ -19,12 +19,12 @@ #ifndef RAJA_PATTERN_DETAIL_FORALL_HPP #define RAJA_PATTERN_DETAIL_FORALL_HPP -#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \ - using std::begin; \ - using std::end; \ - using std::distance; \ - auto begin##SUFFIX = begin(CONTAINER); \ - auto end##SUFFIX = end(CONTAINER); \ +#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \ + using std::begin; \ + using std::end; \ + using std::distance; \ + auto begin##SUFFIX = begin(CONTAINER); \ + auto end##SUFFIX = end(CONTAINER); \ auto distance##SUFFIX = distance(begin##SUFFIX, end##SUFFIX) #define RAJA_EXTRACT_BED_IT(CONTAINER) RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, _it) diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp index 884b9aa989..0f6b7069b2 100644 --- a/include/RAJA/pattern/detail/multi_reduce.hpp +++ b/include/RAJA/pattern/detail/multi_reduce.hpp @@ -26,32 +26,29 @@ #include "RAJA/util/RepeatView.hpp" -#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ - template \ - struct MultiReduce##OP_NAME, T> \ - : reduce::detail::BaseMultiReduce##OP_NAME< \ - DATA, tuning>> \ - { \ - using policy = POL; \ - using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ - DATA, tuning>>; \ - using Base::Base; \ - using typename Base::value_type; \ - using typename Base::reference; \ - \ - RAJA_SUPPRESS_HD_WARN \ - RAJA_HOST_DEVICE \ - reference operator[](size_t bin) const \ - { \ - return reference(*this, bin); \ - } \ +#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ + template \ + struct MultiReduce##OP_NAME, T> \ + : reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>> \ + { \ + using policy = POL; \ + using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>>; \ + using Base::Base; \ + using typename Base::value_type; \ + using typename Base::reference; \ + \ + RAJA_SUPPRESS_HD_WARN \ + RAJA_HOST_DEVICE \ + reference operator[](size_t bin) const { return reference(*this, bin); } \ }; -#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ - RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ +#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA) namespace RAJA 
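The RAJA_DECLARE_MULTI_REDUCER macros above stamp out the MultiReduceSum, MultiReduceMin, MultiReduceMax, MultiReduceBitOr, and MultiReduceBitAnd types whose operator[] selects a bin to combine into. A short usage sketch, assuming the sequential multi-reduce policy is named RAJA::seq_multi_reduce (the histogram data here is illustrative):

#include "RAJA/RAJA.hpp"

void binned_sums(const double* values, const int* bin_of, int n, int num_bins)
{
  // One running sum per bin; operator[](bin) returns a reference proxy
  // whose += combines a value into that bin.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> bin_sum(num_bins);

  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n), [=](int i) {
    bin_sum[bin_of[i]] += values[i];
  });

  for (int b = 0; b < num_bins; ++b)
  {
    double s = bin_sum.get(b);  // per-bin result, available after the loop
    (void)s;
  }
}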
@@ -70,29 +67,34 @@ struct BaseMultiReduce using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp; using value_type = typename t_MultiReduceData::value_type; - BaseMultiReduce() : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} {} + BaseMultiReduce() + : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} + {} explicit BaseMultiReduce(size_t num_bins, value_type init_val = MultiReduceOp::identity(), value_type identity = MultiReduceOp::identity()) : BaseMultiReduce{RepeatView(init_val, num_bins), identity} - { } - - template < typename Container, - concepts::enable_if_t, - concepts::negate>, - concepts::negate>>* = nullptr > + {} + + template < + typename Container, + concepts::enable_if_t< + type_traits::is_range, + concepts::negate>, + concepts::negate>>* = + nullptr> explicit BaseMultiReduce(Container const& container, value_type identity = MultiReduceOp::identity()) : data{container, identity} - { } + {} RAJA_SUPPRESS_HD_WARN BaseMultiReduce(BaseMultiReduce const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduce(BaseMultiReduce &&) = default; - BaseMultiReduce &operator=(BaseMultiReduce const&) = delete; - BaseMultiReduce &operator=(BaseMultiReduce &&) = delete; + BaseMultiReduce(BaseMultiReduce&&) = default; + BaseMultiReduce& operator=(BaseMultiReduce const&) = delete; + BaseMultiReduce& operator=(BaseMultiReduce&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduce() = default; @@ -108,12 +110,13 @@ struct BaseMultiReduce reset(RepeatView(init_val, num_bins), identity); } - template < typename Container, - concepts::enable_if_t>* = nullptr > + template >* = nullptr> void reset(Container const& container, value_type identity = MultiReduceOp::identity()) { - for (size_t bin = 0; bin < data.num_bins(); ++bin) { + for (size_t bin = 0; bin < data.num_bins(); ++bin) + { RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset } data.reset(container, identity); @@ -125,7 +128,7 @@ struct BaseMultiReduce RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - BaseMultiReduce const& combine(size_t bin, value_type const &other) const + BaseMultiReduce const& combine(size_t bin, value_type const& other) const { data.combine(bin, other); return *this; @@ -135,16 +138,19 @@ struct BaseMultiReduce value_type get(size_t bin) const { return data.get(bin); } //! 
Get the calculated reduced value for each bin and store it in container - template < typename Container, - concepts::enable_if_t>* = nullptr > + template >* = nullptr> void get_all(Container& container) const { RAJA_EXTRACT_BED_IT(container); - if (size_t(distance_it) != data.num_bins()) { - RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer"); + if (size_t(distance_it) != data.num_bins()) + { + RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size " + "than multi reducer"); } size_t bin = 0; - for (auto& val : container) { + for (auto& val : container) + { val = data.get(bin); ++bin; } @@ -167,17 +173,17 @@ class BaseMultiReduceMin : public BaseMultiReduce { public: using Base = BaseMultiReduce; - using typename Base::value_type; using Base::Base; + using typename Base::value_type; RAJA_SUPPRESS_HD_WARN BaseMultiReduceMin(BaseMultiReduceMin const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin(BaseMultiReduceMin &&) = default; + BaseMultiReduceMin(BaseMultiReduceMin&&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete; + BaseMultiReduceMin& operator=(BaseMultiReduceMin const&) = delete; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete; + BaseMultiReduceMin& operator=(BaseMultiReduceMin&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceMin() = default; @@ -185,8 +191,8 @@ class BaseMultiReduceMin : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceMin const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -196,10 +202,7 @@ class BaseMultiReduceMin : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceMin const& m_base; @@ -226,9 +229,9 @@ class BaseMultiReduceMax : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceMax(BaseMultiReduceMax const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceMax(BaseMultiReduceMax &&) = default; - BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete; - BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete; + BaseMultiReduceMax(BaseMultiReduceMax&&) = default; + BaseMultiReduceMax& operator=(BaseMultiReduceMax const&) = delete; + BaseMultiReduceMax& operator=(BaseMultiReduceMax&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceMax() = default; @@ -236,8 +239,8 @@ class BaseMultiReduceMax : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceMax const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -247,10 +250,7 @@ class BaseMultiReduceMax : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceMax const& m_base; @@ -277,9 +277,9 @@ class BaseMultiReduceSum : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceSum(BaseMultiReduceSum const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceSum(BaseMultiReduceSum &&) = default; - BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete; - BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete; + BaseMultiReduceSum(BaseMultiReduceSum&&) = default; + BaseMultiReduceSum& operator=(BaseMultiReduceSum const&) = delete; + BaseMultiReduceSum& operator=(BaseMultiReduceSum&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceSum() = default; @@ -287,8 +287,8 @@ class BaseMultiReduceSum : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceSum const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -298,10 +298,7 @@ class BaseMultiReduceSum : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceSum const& m_base; @@ -328,9 +325,9 @@ class BaseMultiReduceBitOr : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default; - BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete; - BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete; + BaseMultiReduceBitOr(BaseMultiReduceBitOr&&) = default; + BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr const&) = delete; + BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceBitOr() = default; @@ -338,8 +335,8 @@ class BaseMultiReduceBitOr : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceBitOr const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -349,10 +346,7 @@ class BaseMultiReduceBitOr : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceBitOr const& m_base; @@ -379,9 +373,9 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce RAJA_SUPPRESS_HD_WARN BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default; RAJA_SUPPRESS_HD_WARN - BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default; - BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete; - BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete; + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd&&) = default; + BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd const&) = delete; + BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd&&) = delete; RAJA_SUPPRESS_HD_WARN ~BaseMultiReduceBitAnd() = default; @@ -389,8 +383,8 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce { RAJA_HOST_DEVICE reference(BaseMultiReduceBitAnd const& base, size_t bin) - : m_base(base), m_bin(bin) - { } + : m_base(base), m_bin(bin) + {} //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE @@ -400,10 +394,7 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce return *this; } - value_type get() const - { - return m_base.get(m_bin); - } + value_type get() const { return m_base.get(m_bin); } private: BaseMultiReduceBitAnd const& m_base; @@ -411,10 +402,10 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce }; }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP */ diff --git a/include/RAJA/pattern/detail/privatizer.hpp b/include/RAJA/pattern/detail/privatizer.hpp index 3579027cd3..036890e067 100644 --- a/include/RAJA/pattern/detail/privatizer.hpp +++ b/include/RAJA/pattern/detail/privatizer.hpp @@ -42,16 +42,19 @@ class has_privatizer static_assert(!has_privatizer::value, "if this fires, abandon all hope"); -struct GenericWrapperBase { -}; +struct GenericWrapperBase +{}; template -struct Privatizer { +struct Privatizer +{ using value_type = camp::decay; using reference_type = value_type&; value_type priv; static_assert(!has_privatizer::value, - "Privatizer selected inappropriately, this is almost certainly " + "Privatizer selected " + "inappropriately, this is almost " + "certainly " "a bug"); static_assert(!std::is_base_of::value, "Privatizer selected inappropriately, this is almost certainly " @@ -96,8 +99,8 @@ RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> typename T::privatizer return typename T::privatizer{item}; } -} // namespace internal +} // namespace internal -} // namespace RAJA +} // namespace RAJA #endif /* __RAJA_PRIVATIZER_HPP */ diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp index 788f3c698d..70cfbd856c 100644 --- a/include/RAJA/pattern/detail/reduce.hpp +++ b/include/RAJA/pattern/detail/reduce.hpp @@ -21,33 +21,33 @@ #include "RAJA/util/Operators.hpp" #include "RAJA/util/types.hpp" -#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER) \ - template \ - class Reduce##OP \ - : public reduce::detail::BaseReduce##OP \ - { \ - public: \ - using Base = reduce::detail::BaseReduce##OP; \ - using Base::Base; \ +#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER) \ + template \ + class Reduce##OP \ + : public reduce::detail::BaseReduce##OP \ + { \ + public: \ + using Base = reduce::detail::BaseReduce##OP; \ + using Base::Base; \ }; -#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER) \ - template \ - class Reduce##OP \ - : public reduce::detail::BaseReduce##OP \ - { \ - public: \ - using Base = reduce::detail::BaseReduce##OP; \ - using Base::Base; \ +#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER) \ + template \ + class Reduce##OP \ + : public reduce::detail::BaseReduce##OP \ + { \ + public: \ + using Base = reduce::detail::BaseReduce##OP; \ + using Base::Base; \ }; -#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Sum, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Min, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(Max, POL, COMBINER) \ - RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER) \ - RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER) \ - RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER) \ +#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Sum, POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Min, POL, COMBINER) \ + RAJA_DECLARE_REDUCER(Max, POL, COMBINER) \ + RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER) \ + RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER) \ + 
RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER) \ RAJA_DECLARE_REDUCER(BitAnd, POL, COMBINER) namespace RAJA @@ -64,39 +64,40 @@ namespace detail { template class Op> -struct op_adapter : private Op { +struct op_adapter : private Op +{ using operator_type = Op; RAJA_HOST_DEVICE static constexpr T identity() { return operator_type::identity(); } - RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val, const T v) const + RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val, const T v) const { val = operator_type::operator()(val, v); } }; -} // namespace detail +} // namespace detail template -struct sum : detail::op_adapter { -}; +struct sum : detail::op_adapter +{}; template -struct min : detail::op_adapter { -}; +struct min : detail::op_adapter +{}; template -struct max : detail::op_adapter { -}; +struct max : detail::op_adapter +{}; template -struct or_bit : detail::op_adapter { -}; +struct or_bit : detail::op_adapter +{}; template -struct and_bit : detail::op_adapter { -}; +struct and_bit : detail::op_adapter +{}; #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -107,10 +108,11 @@ namespace detail { template ::value> -struct DefaultLoc {}; +struct DefaultLoc +{}; template -struct DefaultLoc // any non-integral type +struct DefaultLoc // any non-integral type { RAJA_HOST_DEVICE constexpr T value() const { return T(); } }; @@ -128,55 +130,67 @@ class ValueLoc T val = doing_min ? operators::limits::max() : operators::limits::min(); IndexType loc = DefaultLoc().value(); -#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || defined(__HIPCC__) +#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || \ + defined(__HIPCC__) RAJA_HOST_DEVICE constexpr ValueLoc() {} - RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other) : val{other.val}, loc{other.loc} {} + RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const& other) + : val{other.val}, loc{other.loc} + {} RAJA_HOST_DEVICE - ValueLoc &operator=(ValueLoc const &other) { val = other.val; loc = other.loc; return *this;} + ValueLoc& operator=(ValueLoc const& other) + { + val = other.val; + loc = other.loc; + return *this; + } #else constexpr ValueLoc() = default; - constexpr ValueLoc(ValueLoc const &) = default; - ValueLoc &operator=(ValueLoc const &) = default; + constexpr ValueLoc(ValueLoc const&) = default; + ValueLoc& operator=(ValueLoc const&) = default; #endif - RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_) : val{val_}, loc{DefaultLoc().value()} {} - RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_, IndexType const &loc_) + RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_) + : val{val_}, loc{DefaultLoc().value()} + {} + RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_, IndexType const& loc_) : val{val_}, loc{loc_} - { - } + {} RAJA_HOST_DEVICE operator T() const { return val; } RAJA_HOST_DEVICE IndexType getLoc() { return loc; } - RAJA_HOST_DEVICE bool operator<(ValueLoc const &rhs) const + RAJA_HOST_DEVICE bool operator<(ValueLoc const& rhs) const { return val < rhs.val; } - RAJA_HOST_DEVICE bool operator>(ValueLoc const &rhs) const + RAJA_HOST_DEVICE bool operator>(ValueLoc const& rhs) const { return val > rhs.val; } }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce namespace operators { template -struct limits<::RAJA::reduce::detail::ValueLoc> { - RAJA_INLINE RAJA_HOST_DEVICE static constexpr - ::RAJA::reduce::detail::ValueLoc min() +struct limits<::RAJA::reduce::detail::ValueLoc> +{ + RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail:: + ValueLoc + min() 
{ return ::RAJA::reduce::detail::ValueLoc(limits::min()); } - RAJA_INLINE RAJA_HOST_DEVICE static constexpr - ::RAJA::reduce::detail::ValueLoc max() + RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail:: + ValueLoc + max() { return ::RAJA::reduce::detail::ValueLoc(limits::max()); } }; -} // namespace operators +} // namespace operators namespace reduce { @@ -208,8 +222,7 @@ class BaseReduce RAJA_HOST_DEVICE BaseReduce(T init_val, T identity_ = Reduce::identity()) : c{init_val, identity_} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE @@ -220,27 +233,27 @@ class BaseReduce } //! prohibit compiler-generated copy assignment - BaseReduce &operator=(const BaseReduce &) = delete; + BaseReduce& operator=(const BaseReduce&) = delete; //! compiler-generated copy constructor RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - BaseReduce(const BaseReduce ©) : c(copy.c) {} + BaseReduce(const BaseReduce& copy) : c(copy.c) {} //! compiler-generated move constructor RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE RAJA_INLINE - BaseReduce(BaseReduce &©) : c(std::move(copy.c)) {} + BaseReduce(BaseReduce&& copy) : c(std::move(copy.c)) {} //! compiler-generated move assignment - BaseReduce &operator=(BaseReduce &&) = default; + BaseReduce& operator=(BaseReduce&&) = default; RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - void combine(T const &other) const { c.combine(other); } + void combine(T const& other) const { c.combine(other); } - T &local() const { return c.local(); } + T& local() const { return c.local(); } //! Get the calculated reduced value operator T() const { return c.get(); } @@ -253,7 +266,7 @@ template class BaseCombinable { protected: - BaseCombinable const *parent = nullptr; + BaseCombinable const* parent = nullptr; T identity; T mutable my_data; @@ -266,8 +279,7 @@ class BaseCombinable RAJA_HOST_DEVICE constexpr BaseCombinable(T init_val, T identity_ = T()) : identity{identity_}, my_data{init_val} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE @@ -279,25 +291,25 @@ class BaseCombinable RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - constexpr BaseCombinable(BaseCombinable const &other) + constexpr BaseCombinable(BaseCombinable const& other) : parent{other.parent ? other.parent : &other}, identity{other.identity}, my_data{identity} - { - } + {} RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE ~BaseCombinable() { - if (parent && my_data != identity) { + if (parent && my_data != identity) + { Reduce()(parent->my_data, my_data); } } RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - void combine(T const &other) { Reduce{}(my_data, other); } + void combine(T const& other) { Reduce{}(my_data, other); } /*! * \return the calculated reduced value @@ -307,17 +319,17 @@ class BaseCombinable /*! * \return reference to the local value */ - T &local() const { return my_data; } + T& local() const { return my_data; } T get_combined() const { return my_data; } private: // Convenience method for CRTP - const Derived &derived() const + const Derived& derived() const { - return *(static_cast(this)); + return *(static_cast(this)); } - Derived &derived() { return *(static_cast(this)); } + Derived& derived() { return *(static_cast(this)); } }; /*! @@ -336,7 +348,7 @@ class BaseReduceMin : public BaseReduce //! 
reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMin &min(T rhs) const + const BaseReduceMin& min(T rhs) const { this->combine(rhs); return *this; @@ -350,7 +362,10 @@ class BaseReduceMin : public BaseReduce * ************************************************************************** */ -template class Combiner> +template + class Combiner> class BaseReduceMinLoc : public BaseReduce, RAJA::reduce::min, Combiner> { @@ -362,24 +377,28 @@ class BaseReduceMinLoc constexpr BaseReduceMinLoc() : Base(value_type(T(), IndexType())) {} - constexpr BaseReduceMinLoc(T init_val, IndexType init_idx, - T identity_val_ = reduce_type::identity(), - IndexType identity_loc_ = DefaultLoc().value()) - : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)) - { - } - - void reset(T init_val, IndexType init_idx, + constexpr BaseReduceMinLoc( + T init_val, + IndexType init_idx, + T identity_val_ = reduce_type::identity(), + IndexType identity_loc_ = DefaultLoc().value()) + : Base(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)) + {} + + void reset(T init_val, + IndexType init_idx, T identity_val_ = reduce_type::identity(), IndexType identity_loc_ = DefaultLoc().value()) { operator T(); // automatic get() before reset - Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)); + Base::reset(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)); } /// \brief reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMinLoc &minloc(T rhs, IndexType loc) const + const BaseReduceMinLoc& minloc(T rhs, IndexType loc) const { this->combine(value_type(rhs, loc)); return *this; @@ -408,7 +427,7 @@ class BaseReduceMax : public BaseReduce //! reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMax &max(T rhs) const + const BaseReduceMax& max(T rhs) const { this->combine(rhs); return *this; @@ -432,7 +451,7 @@ class BaseReduceSum : public BaseReduce //! reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceSum &operator+=(T rhs) const + const BaseReduceSum& operator+=(T rhs) const { this->combine(rhs); return *this; @@ -456,7 +475,7 @@ class BaseReduceBitOr : public BaseReduce //! reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceBitOr &operator|=(T rhs) const + const BaseReduceBitOr& operator|=(T rhs) const { this->combine(rhs); return *this; @@ -480,7 +499,7 @@ class BaseReduceBitAnd : public BaseReduce //! 
reducer function; updates the current instance's state RAJA_SUPPRESS_HD_WARN RAJA_HOST_DEVICE - const BaseReduceBitAnd &operator&=(T rhs) const + const BaseReduceBitAnd& operator&=(T rhs) const { this->combine(rhs); return *this; @@ -495,36 +514,45 @@ class BaseReduceBitAnd : public BaseReduce * ************************************************************************** */ -template class Combiner> -class BaseReduceMaxLoc - : public BaseReduce, RAJA::reduce::max, Combiner> +template + class Combiner> +class BaseReduceMaxLoc : public BaseReduce, + RAJA::reduce::max, + Combiner> { public: - using Base = BaseReduce, RAJA::reduce::max, Combiner>; + using Base = + BaseReduce, RAJA::reduce::max, Combiner>; using value_type = typename Base::value_type; using reduce_type = typename Base::reduce_type; using Base::Base; constexpr BaseReduceMaxLoc() : Base(value_type(T(), IndexType())) {} - constexpr BaseReduceMaxLoc(T init_val, IndexType init_idx, - T identity_val_ = reduce_type::identity(), - IndexType identity_loc_ = DefaultLoc().value()) - : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)) - { - } - - void reset(T init_val, IndexType init_idx, + constexpr BaseReduceMaxLoc( + T init_val, + IndexType init_idx, + T identity_val_ = reduce_type::identity(), + IndexType identity_loc_ = DefaultLoc().value()) + : Base(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)) + {} + + void reset(T init_val, + IndexType init_idx, T identity_val_ = reduce_type::identity(), IndexType identity_loc_ = DefaultLoc().value()) { operator T(); // automatic get() before reset - Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_)); + Base::reset(value_type(init_val, init_idx), + value_type(identity_val_, identity_loc_)); } //! reducer function; updates the current instance's state RAJA_HOST_DEVICE - const BaseReduceMaxLoc &maxloc(T rhs, IndexType loc) const + const BaseReduceMaxLoc& maxloc(T rhs, IndexType loc) const { this->combine(value_type(rhs, loc)); return *this; @@ -537,10 +565,10 @@ class BaseReduceMaxLoc operator T() const { return Base::get(); } }; -} // namespace detail +} // namespace detail -} // namespace reduce +} // namespace reduce -} // namespace RAJA +} // namespace RAJA #endif /* RAJA_PATTERN_DETAIL_REDUCE_HPP */ diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index 686f0e8c6b..2382f2bc78 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -98,7 +98,8 @@ namespace detail { /// Adapter to replace specific implementations for the icount variants template -struct icount_adapter { +struct icount_adapter +{ using index_type = typename std::decay::type; typename std::decay::type body; using container_type = typename std::decay::type; @@ -119,20 +120,32 @@ struct icount_adapter { } }; -struct CallForall { - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; +struct CallForall +{ + template + RAJA_INLINE camp::resources::EventProxy + operator()(T const&, ExecPol, Body, Res, ForallParams) const; }; -struct CallForallIcount { +struct CallForallIcount +{ constexpr CallForallIcount(int s); - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; + template + RAJA_INLINE camp::resources::EventProxy + operator()(T const&, ExecPol, Body, Res, ForallParams) const; const int start; }; -} // namespace detail +} // namespace detail /*! 
****************************************************************************** @@ -152,12 +165,20 @@ namespace wrap * ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, concepts::negate>, type_traits::is_range> -forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params) +forall(Res r, + ExecutionPolicy&& p, + Container&& c, + LoopBody&& loop_body, + ForallParams&& f_params) { RAJA_FORCEINLINE_RECURSIVE return forall_impl(r, @@ -167,7 +188,10 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallPa std::forward(f_params)); } -template +template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, concepts::negate>, @@ -197,22 +221,25 @@ template RAJA_INLINE resources::EventProxy forall_Icount(Res r, - ExecutionPolicy&& p, - Container&& c, - IndexType&& icount, - LoopBody&& loop_body, - ForallParams&& f_params) + ExecutionPolicy&& p, + Container&& c, + IndexType&& icount, + LoopBody&& loop_body, + ForallParams&& f_params) { using std::begin; using std::distance; using std::end; auto range = RangeSegment(0, distance(begin(c), end(c))); - detail::icount_adapter adapted(c, - loop_body, - icount); + detail::icount_adapter adapted( + c, loop_body, icount); using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, std::forward(p), range, adapted, std::forward(f_params)); + return forall_impl(r, + std::forward(p), + range, + adapted, + std::forward(f_params)); } /*! @@ -230,15 +257,16 @@ template -RAJA_INLINE resources::EventProxy forall_Icount(Res r, - ExecPolicy, - const TypedIndexSet& iset, - LoopBody loop_body, - ForallParams f_params) +RAJA_INLINE resources::EventProxy +forall_Icount(Res r, + ExecPolicy, + const TypedIndexSet& iset, + LoopBody loop_body, + ForallParams f_params) { // no need for icount variant here - auto segIterRes = resources::get_resource::type::get_default(); + auto segIterRes = + resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { iset.segmentCall(segID, detail::CallForallIcount(iset.getStartingIcount(segID)), @@ -256,30 +284,35 @@ template -RAJA_INLINE resources::EventProxy forall(Res r, - ExecPolicy, - const TypedIndexSet& iset, - LoopBody loop_body, - ForallParams f_params) -{ - auto segIterRes = resources::get_resource::type::get_default(); +RAJA_INLINE resources::EventProxy +forall(Res r, + ExecPolicy, + const TypedIndexSet& iset, + LoopBody loop_body, + ForallParams f_params) +{ + auto segIterRes = + resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { - iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params); + iset.segmentCall(segID, + detail::CallForall{}, + SegmentExecPolicy(), + loop_body, + r, + f_params); }); return RAJA::resources::EventProxy(r); } -} // end namespace wrap - +} // end namespace wrap /*! ****************************************************************************** * - * \brief The RAJA::policy_by_value_interface forall functions provide an interface with - * value-based policies. It also enforces the interface and performs - * static checks as well as triggering plugins and loop body updates. + * \brief The RAJA::policy_by_value_interface forall functions provide an + *interface with value-based policies. 
It also enforces the interface and + *performs static checks as well as triggering plugins and loop body updates. * ****************************************************************************** */ @@ -294,11 +327,12 @@ inline namespace policy_by_value_interface * ****************************************************************************** */ -template -RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, - Res r, - IdxSet&& c, - Params&&... params) +template +RAJA_INLINE resources::EventProxy +forall_Icount(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " @@ -306,9 +340,10 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, auto f_params = expt::make_forall_param_pack(std::forward(params)...); auto&& loop_body = expt::get_lambda(std::forward(params)...); - //expt::check_forall_optional_args(loop_body, f_params); + // expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -318,21 +353,23 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, util::callPreLaunchPlugins(context); - RAJA::resources::EventProxy e = wrap::forall_Icount( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + RAJA::resources::EventProxy e = + wrap::forall_Icount(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > -RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, - IdxSet&& c, - LoopBody&& loop_body) +template < + typename ExecutionPolicy, + typename IdxSet, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE resources::EventProxy +forall_Icount(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) { auto r = Res::get_default(); return ::RAJA::policy_by_value_interface::forall_Icount( @@ -349,11 +386,14 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) +template +RAJA_INLINE + concepts::enable_if_t, + type_traits::is_indexset_policy> + forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " @@ -363,7 +403,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) auto&& loop_body = expt::get_lambda(std::forward(params)...); expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -373,22 +414,24 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... 
params) util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + resources::EventProxy e = wrap::forall(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) +template < + typename ExecutionPolicy, + typename IdxSet, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE + concepts::enable_if_t, + type_traits::is_indexset_policy> + forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) { auto r = Res::get_default(); return ::RAJA::policy_by_value_interface::forall( @@ -405,12 +448,14 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) * ****************************************************************************** */ -template ::type > -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_multi_policy, - type_traits::is_range> +template < + typename ExecutionPolicy, + typename Container, + typename LoopBody, + typename Res = typename resources::get_resource::type> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_multi_policy, + type_traits::is_range> forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) { static_assert(type_traits::is_random_access_range::value, @@ -420,9 +465,9 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) // plugins handled in multipolicy policy_invoker return forall_impl(r, - std::forward(p), - std::forward(c), - std::forward(loop_body)); + std::forward(p), + std::forward(c), + std::forward(loop_body)); } /*! 
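The forall overloads reflowed in this file are what user code reaches through RAJA::forall and RAJA::forall_Icount. A minimal sketch of both entry points with the sequential policy, assuming the range overload of forall_Icount that takes an icount offset before the lambda (the array names are illustrative):

#include "RAJA/RAJA.hpp"

void daxpy(double* y, const double* x, double a, int n)
{
  // Basic forall: the body receives the loop index.
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
                               [=](int i) { y[i] += a * x[i]; });

  // forall_Icount: the body receives (icount, index); icount starts at the
  // offset passed after the segment (0 here), which matters for index sets
  // and non-zero starting counts.
  RAJA::forall_Icount<RAJA::seq_exec>(
      RAJA::RangeSegment(0, n), 0,
      [=](int icount, int i) { y[i] += a * x[icount]; });
}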
@@ -438,10 +483,9 @@ template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - type_traits::is_range, - type_traits::is_integral> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_range, + type_traits::is_integral> forall_Icount(ExecutionPolicy&& p, Res r, Container&& c, @@ -452,11 +496,14 @@ forall_Icount(ExecutionPolicy&& p, static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); - auto f_params = expt::make_forall_param_pack(std::forward(first), std::forward(params)...); - auto&& loop_body = expt::get_lambda(std::forward(first), std::forward(params)...); - //expt::check_forall_optional_args(loop_body, f_params); + auto f_params = expt::make_forall_param_pack(std::forward(first), + std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(first), + std::forward(params)...); + // expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -466,22 +513,23 @@ forall_Icount(ExecutionPolicy&& p, util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall_Icount( - r, - std::forward(p), - std::forward(c), - icount, - std::move(body), - f_params); + resources::EventProxy e = + wrap::forall_Icount(r, + std::forward(p), + std::forward(c), + icount, + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > +template < + typename ExecutionPolicy, + typename Container, + typename IndexType, + typename LoopBody, + typename Res = typename resources::get_resource::type> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_range, @@ -509,7 +557,10 @@ forall_Icount(ExecutionPolicy&& p, ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, @@ -524,7 +575,8 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) auto&& loop_body = expt::get_lambda(std::forward(params)...); expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + util::PluginContext context{ + util::make_context>()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -534,19 +586,21 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) util::callPreLaunchPlugins(context); - resources::EventProxy e = wrap::forall( - r, - std::forward(p), - std::forward(c), - std::move(body), - f_params); + resources::EventProxy e = wrap::forall(r, + std::forward(p), + std::forward(c), + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } -template ::type > +template < + typename ExecutionPolicy, + typename Container, + typename LoopBody, + typename Res = typename resources::get_resource::type> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, @@ -562,7 +616,7 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) std::forward(loop_body)); } -} // end inline namespace policy_by_value_interface +} // namespace policy_by_value_interface /*! @@ -570,8 +624,10 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) * * this reduces implementation overhead and perfectly forwards all arguments */ -template ::type > +template < + typename ExecutionPolicy, + typename... 
Args, + typename Res = typename resources::get_resource::type> RAJA_INLINE resources::EventProxy forall(Args&&... args) { Res r = Res::get_default(); @@ -579,7 +635,8 @@ RAJA_INLINE resources::EventProxy forall(Args&&... args) ExecutionPolicy(), r, std::forward(args)...); } template -RAJA_INLINE concepts::enable_if_t, type_traits::is_resource> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_resource> forall(Res r, Args&&... args) { return ::RAJA::policy_by_value_interface::forall( @@ -592,8 +649,10 @@ forall(Res r, Args&&... args) * * this reduces implementation overhead and perfectly forwards all arguments */ -template ::type > +template < + typename ExecutionPolicy, + typename... Args, + typename Res = typename resources::get_resource::type> RAJA_INLINE resources::EventProxy forall_Icount(Args&&... args) { Res r = Res::get_default(); @@ -601,7 +660,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(Args&&... args) ExecutionPolicy(), r, std::forward(args)...); } template -RAJA_INLINE concepts::enable_if_t, type_traits::is_resource> +RAJA_INLINE concepts::enable_if_t, + type_traits::is_resource> forall_Icount(Res r, Args&&... args) { return ::RAJA::policy_by_value_interface::forall_Icount( @@ -611,12 +671,17 @@ forall_Icount(Res r, Args&&... args) namespace detail { -template -RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& segment, - ExecutionPolicy, - LoopBody body, - Res r, - ForallParams f_params) const +template +RAJA_INLINE camp::resources::EventProxy +CallForall::operator()(T const& segment, + ExecutionPolicy, + LoopBody body, + Res r, + ForallParams f_params) const { // this is only called inside a region, use impl using policy::sequential::forall_impl; @@ -626,18 +691,24 @@ RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& seg constexpr CallForallIcount::CallForallIcount(int s) : start(s) {} -template -RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T const& segment, - ExecutionPolicy, - LoopBody body, - Res r, - ForallParams f_params) const +template +RAJA_INLINE camp::resources::EventProxy +CallForallIcount::operator()(T const& segment, + ExecutionPolicy, + LoopBody body, + Res r, + ForallParams f_params) const { // go through wrap to unwrap icount - return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params); + return wrap::forall_Icount( + r, ExecutionPolicy(), segment, start, body, f_params); } -} // namespace detail +} // namespace detail // // Experimental support for dynamic policy selection @@ -650,104 +721,116 @@ RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T cons namespace expt { - template - struct dynamic_helper +template +struct dynamic_helper +{ + template + static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body) { - template - static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (IDX == pol) { - if(IDX==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - dynamic_helper::invoke_forall(pol, seg, body); + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; } + dynamic_helper::invoke_forall(pol, seg, body); + } - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - - using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; + template + static resources::EventProxy + 
invoke_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) + { - if(IDX==pol){ - RAJA::forall(r.get(), seg, body); + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; - //Return a generic event proxy from r, - //because forall returns a typed event proxy - return {r}; - } + if (IDX == pol) + { + RAJA::forall(r.get(), seg, body); - return dynamic_helper::invoke_forall(r, pol, seg, body); + // Return a generic event proxy from r, + // because forall returns a typed event proxy + return {r}; } - }; + return dynamic_helper::invoke_forall( + r, pol, seg, body); + } +}; - template - struct dynamic_helper<0, POLICY_LIST> +template +struct dynamic_helper<0, POLICY_LIST> +{ + template + static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body) { - template - static void - invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (0 == pol) { - if(0==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - RAJA_ABORT_OR_THROW("Policy enum not supported "); + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; } + RAJA_ABORT_OR_THROW("Policy enum not supported "); + } - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) + { + if (pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); - using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; - RAJA::forall(r.get(), seg, body); + RAJA::forall(r.get(), seg, body); - //Return a generic event proxy from r, - //because forall returns a typed event proxy - return {r}; - } + // Return a generic event proxy from r, + // because forall returns a typed event proxy + return {r}; + } +}; - }; +template +void dynamic_forall(const int pol, SEGMENT const& seg, BODY const& body) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); - template - void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body) + if (pol > N - 1) { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); - - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy enum not supported"); - } - dynamic_helper::invoke_forall(pol, seg, body); + RAJA_ABORT_OR_THROW("Policy enum not supported"); } + dynamic_helper::invoke_forall(pol, seg, body); +} - template - resources::EventProxy - dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); - - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy value out of range"); - } +template +resources::EventProxy +dynamic_forall(RAJA::resources::Resource r, + const int pol, + SEGMENT const& seg, + BODY const& body) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); - return dynamic_helper::invoke_forall(r, pol, seg, body); + if (pol > N - 1) + { + RAJA_ABORT_OR_THROW("Policy value out of range"); } -} // namespace expt + return 
dynamic_helper::invoke_forall(r, pol, seg, body); +} + +} // namespace expt -} // namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp index 1875fe27d9..da1e6f6be7 100644 --- a/include/RAJA/pattern/kernel.hpp +++ b/include/RAJA/pattern/kernel.hpp @@ -55,37 +55,36 @@ template struct IterableWrapperTuple; template -struct IterableWrapperTuple> { +struct IterableWrapperTuple> +{ - using type = - camp::tuple::iterator, - typename camp::decay::IndexType>...>; + using type = camp::tuple::iterator, + typename camp::decay::IndexType>...>; }; namespace internal { template -RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t, - camp::idx_seq) - -> camp::tuple>>::iterator, - typename camp::decay< - camp::tuple_element_t>>::IndexType>...> +RAJA_INLINE constexpr auto +make_wrapped_tuple_impl(Tuple&& t, camp::idx_seq) -> camp::tuple< + RAJA::Span>>::iterator, + typename camp::decay< + camp::tuple_element_t>>::IndexType>...> { return camp::make_tuple( - RAJA::Span< - typename camp::decay< - camp::tuple_element_t>>::iterator, - typename camp::decay>>:: - IndexType>{camp::get(std::forward(t)).begin(), - camp::get(std::forward(t)).end()}...); + RAJA::Span>>::iterator, + typename camp::decay< + camp::tuple_element_t>>::IndexType>{ + camp::get(std::forward(t)).begin(), + camp::get(std::forward(t)).end()}...); } -} // namespace internal +} // namespace internal template -RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple &&t) +RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple&& t) -> decltype(internal::make_wrapped_tuple_impl( std::forward(t), camp::make_idx_seq_t>::value>{})) @@ -101,10 +100,11 @@ template -RAJA_INLINE resources::EventProxy kernel_param_resource(SegmentTuple &&segments, - ParamTuple &¶ms, - Resource resource, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy +kernel_param_resource(SegmentTuple&& segments, + ParamTuple&& params, + Resource resource, + Bodies&&... bodies) { util::PluginContext context{util::make_context()}; @@ -131,11 +131,11 @@ RAJA_INLINE resources::EventProxy kernel_param_resource(SegmentTuple & // our segments, loop bodies, and the tuple of loop indices // it is passed through all of the kernel mechanics by-referenece, // and only copied to provide thread-private instances. - loop_data_t loop_data(make_wrapped_tuple( - std::forward(segments)), - std::forward(params), - resource, - std::forward(bodies)...); + loop_data_t loop_data( + make_wrapped_tuple(std::forward(segments)), + std::forward(params), + resource, + std::forward(bodies)...); util::callPostCapturePlugins(context); @@ -156,44 +156,45 @@ template -RAJA_INLINE resources::EventProxy kernel_resource(SegmentTuple &&segments, - Resource resource, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy +kernel_resource(SegmentTuple&& segments, Resource resource, Bodies&&... bodies) { - return RAJA::kernel_param_resource(std::forward(segments), - RAJA::make_tuple(), - resource, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + RAJA::make_tuple(), + resource, + std::forward(bodies)...); } template -RAJA_INLINE resources::EventProxy> kernel_param(SegmentTuple &&segments, - ParamTuple &¶ms, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy> +kernel_param(SegmentTuple&& segments, ParamTuple&& params, Bodies&&... 
bodies) { auto res = resources::get_default_resource(); - return RAJA::kernel_param_resource(std::forward(segments), - std::forward(params), - res, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + std::forward(params), + res, + std::forward(bodies)...); } template -RAJA_INLINE resources::EventProxy> kernel(SegmentTuple &&segments, - Bodies &&... bodies) +RAJA_INLINE resources::EventProxy> +kernel(SegmentTuple&& segments, Bodies&&... bodies) { auto res = resources::get_default_resource(); - return RAJA::kernel_param_resource(std::forward(segments), - RAJA::make_tuple(), - res, - std::forward(bodies)...); + return RAJA::kernel_param_resource( + std::forward(segments), + RAJA::make_tuple(), + res, + std::forward(bodies)...); } -} // end namespace RAJA +} // end namespace RAJA #include "RAJA/pattern/kernel/Collapse.hpp" diff --git a/include/RAJA/pattern/kernel/Collapse.hpp b/include/RAJA/pattern/kernel/Collapse.hpp index 8efb126397..095ad402ef 100644 --- a/include/RAJA/pattern/kernel/Collapse.hpp +++ b/include/RAJA/pattern/kernel/Collapse.hpp @@ -29,12 +29,12 @@ namespace statement template struct Collapse : public internal::ForList, public internal::CollapseBase, - public internal::Statement { -}; + public internal::Statement +{}; -} // namespace statement -} // end namespace RAJA +} // namespace statement +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Conditional.hpp b/include/RAJA/pattern/kernel/Conditional.hpp index 6b7875c4c2..450fecfd5d 100644 --- a/include/RAJA/pattern/kernel/Conditional.hpp +++ b/include/RAJA/pattern/kernel/Conditional.hpp @@ -37,8 +37,8 @@ namespace statement * */ template -struct If : public internal::Statement { -}; +struct If : public internal::Statement +{}; /*! 
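For the kernel / kernel_param / kernel_resource entry points reformatted above, a typical call pairs a tuple of segments with a KernelPolicy of nested For statements. A sketch of a sequential two-level nest (names are illustrative):

#include "RAJA/RAJA.hpp"

void init(double* A, int N, int M)
{
  using EXEC_POL = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,        // outer loop over segment 1 (rows)
          RAJA::statement::For<0, RAJA::seq_exec,    // inner loop over segment 0 (cols)
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<EXEC_POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, M),     // segment 0 -> j
                       RAJA::RangeSegment(0, N)),    // segment 1 -> i
      [=](int j, int i) { A[i * M + j] = i + 0.1 * j; });
}

Lambda arguments follow segment-tuple order, not loop-nesting order, so segment 0 arrives first regardless of which For is outermost.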
@@ -46,10 +46,11 @@ struct If : public internal::Statement { * */ template -struct Value { +struct Value +{ template - RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const &) + RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const&) { return value; } @@ -60,10 +61,11 @@ struct Value { * */ template -struct Equals { +struct Equals +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) == R::eval(data); } @@ -74,10 +76,11 @@ struct Equals { * */ template -struct NotEquals { +struct NotEquals +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) != R::eval(data); } @@ -89,10 +92,11 @@ struct NotEquals { * */ template -struct Or { +struct Or +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) || R::eval(data); } @@ -104,10 +108,11 @@ struct Or { * */ template -struct And { +struct And +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) && R::eval(data); } @@ -119,10 +124,11 @@ struct And { * */ template -struct LessThan { +struct LessThan +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) < R::eval(data); } @@ -134,10 +140,11 @@ struct LessThan { * */ template -struct LessThanEq { +struct LessThanEq +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) <= R::eval(data); } @@ -149,10 +156,11 @@ struct LessThanEq { * */ template -struct GreaterThan { +struct GreaterThan +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) > R::eval(data); } @@ -164,10 +172,11 @@ struct GreaterThan { * */ template -struct GreaterThanEq { +struct GreaterThanEq +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return L::eval(data) >= R::eval(data); } @@ -179,31 +188,34 @@ struct GreaterThanEq { * */ template -struct Not { +struct Not +{ template - RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data) { return !(L::eval(data)); } }; -} // end namespace statement +} // end namespace statement namespace internal { template -struct StatementExecutor, Types> { +struct StatementExecutor, Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { - if (Condition::eval(data)) { + if (Condition::eval(data)) + { execute_statement_list, Types>( std::forward(data)); } @@ -211,8 +223,8 @@ struct StatementExecutor, Types> { }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 539c451673..71b6bd3009 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -42,14 +42,15 @@ template struct For : public 
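The statement::If condition nodes reformatted above (Value, Equals, NotEquals, Or, And, LessThan, ...) evaluate against the kernel's parameter tuple. A hedged sketch that guards a lambda on a run-time flag supplied through kernel_param (the flag index and policy are assumptions for illustration):

#include "RAJA/RAJA.hpp"

void maybe_scale(double* x, int N, int flag)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<0, RAJA::seq_exec,
          // Run Lambda<0> only when parameter 0 equals 1.
          RAJA::statement::If<
              RAJA::statement::Equals<RAJA::statement::Param<0>,
                                      RAJA::statement::Value<1>>,
              RAJA::statement::Lambda<0, RAJA::Segs<0>>>>>;

  RAJA::kernel_param<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N)),
                          RAJA::make_tuple(flag),
                          [=](int i) { x[i] *= 2.0; });
}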
internal::ForList, public internal::ForTraitBase, - public internal::Statement { + public internal::Statement +{ // TODO: add static_assert for valid policy in Pol using execution_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -59,8 +60,12 @@ namespace internal * Assigns the loop index to offset ArgumentId * */ -template -struct ForWrapper : public GenericWrapper { +template +struct ForWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -85,11 +90,13 @@ template struct StatementExecutor< - statement::For, Types> { + statement::For, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop @@ -103,7 +110,11 @@ struct StatementExecutor< auto r = data.res; - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + ExecPolicy{}, + TypedRangeSegment(0, len), + for_wrapper, + RAJA::expt::get_empty_forall_param_pack()); } }; @@ -112,15 +123,14 @@ struct StatementExecutor< * * */ -template -struct StatementExecutor< - statement::For, Types> { +template +struct StatementExecutor, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop @@ -134,15 +144,16 @@ struct StatementExecutor< RAJA_EXTRACT_BED_IT(TypedRangeSegment(0, len)); - for (decltype(distance_it) i = 0; i < distance_it; ++i) { + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { for_wrapper(*(begin_it + i)); } } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_For_HPP */ diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp index 18515c7f59..9dfd2ca126 100644 --- a/include/RAJA/pattern/kernel/ForICount.hpp +++ b/include/RAJA/pattern/kernel/ForICount.hpp @@ -44,8 +44,9 @@ template struct ForICount : public internal::ForList, - public internal::ForTraitBase, - public internal::Statement { + public internal::ForTraitBase, + public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " @@ -54,7 +55,7 @@ struct ForICount : public internal::ForList, using execution_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -64,9 +65,13 @@ namespace internal * Assigns the loop index to offset ArgumentId * Assigns the loop index to param ParamId */ -template -struct ForICountWrapper : public GenericWrapper { +struct ForICountWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -93,32 +98,38 @@ template struct StatementExecutor< - statement::ForICount, Types> { + statement::ForICount, + Types> +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Set the argument type for this loop using NewTypes = setSegmentTypeFromData; // Create a wrapper, just in case forall_impl needs to thread_privatize - ForICountWrapper for_wrapper(data); + ForICountWrapper + for_wrapper(data); auto len = segment_length(data); using len_t = decltype(len); auto r = resources::get_resource::type::get_default(); - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + ExecPolicy{}, + TypedRangeSegment(0, len), + for_wrapper, + 
RAJA::expt::get_empty_forall_param_pack()); } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_nested_HPP */ diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp index 955afcecc0..78a2383e43 100644 --- a/include/RAJA/pattern/kernel/Hyperplane.hpp +++ b/include/RAJA/pattern/kernel/Hyperplane.hpp @@ -81,21 +81,18 @@ template -struct Hyperplane - : public internal::Statement { -}; +struct Hyperplane : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { template -struct HyperplaneInner - : public internal::Statement { -}; +struct HyperplaneInner : public internal::Statement +{}; template , ExecPolicy, - EnclosedStmts...>, Types> { + EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // get type of Hp arguments index @@ -135,9 +134,9 @@ struct StatementExecutor(data) + - foldl(RAJA::operators::plus(), - segment_length(data)...); + idx_t hp_len = + segment_length(data) + + foldl(RAJA::operators::plus(), segment_length(data)...); /* Execute the outer loop over hyperplanes * @@ -146,7 +145,8 @@ struct StatementExecutor::type::get_default(); - forall_impl(r, HpExecPolicy{}, + forall_impl(r, + HpExecPolicy{}, TypedRangeSegment(0, hp_len), outer_wrapper, RAJA::expt::get_empty_forall_param_pack()); @@ -159,11 +159,13 @@ template struct StatementExecutor< - HyperplaneInner, EnclosedStmts...>, Types> { + HyperplaneInner, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // get h value @@ -173,13 +175,14 @@ struct StatementExecutor< // compute actual iterate for HpArgumentId // as: i0 = h - (i1 + i2 + i3 + ...) 
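statement::ForICount, shown in the ForICount.hpp hunk above, behaves like statement::For but additionally stores the local (0-based) loop count into the parameter named by statement::Param. A small sketch, assuming the usual pattern of reserving a parameter slot for the count:

#include "RAJA/RAJA.hpp"

void gather(int* out, int N, int offset)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
          RAJA::statement::Lambda<0>>>;

  // The segment starts at 'offset'; Param<0> receives the 0-based loop count.
  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(offset, offset + N)),
      RAJA::make_tuple(static_cast<RAJA::Index_type>(0)),
      [=](RAJA::Index_type i, RAJA::Index_type count) {
        out[count] = static_cast<int>(i);
      });
}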
idx_t i = h - foldl(RAJA::operators::plus(), - camp::get(data.offset_tuple)...); + camp::get(data.offset_tuple)...); // get length of Hp indexed argument auto len = segment_length(data); // check bounds - if (i >= 0 && i < len) { + if (i >= 0 && i < len) + { // store in tuple data.template assign_offset(i); @@ -194,8 +197,8 @@ struct StatementExecutor< }; -} // end namespace internal +} // end namespace internal -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp index 21d9e3cd2a..b8cb6208d3 100644 --- a/include/RAJA/pattern/kernel/InitLocalMem.hpp +++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp @@ -26,7 +26,7 @@ namespace RAJA { -//Policies for RAJA local arrays +// Policies for RAJA local arrays struct cpu_tile_mem; @@ -43,43 +43,51 @@ namespace statement * IntiLocalMem, statements...> * Will intialize the 0th array in the param tuple */ -template -struct InitLocalMem : public internal::Statement { -}; +template +struct InitLocalMem : public internal::Statement +{}; -//Policy Specialization -template -struct InitLocalMem, EnclosedStmts...> : public internal::Statement { -}; +// Policy Specialization +template +struct InitLocalMem, + EnclosedStmts...> : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { -//Statement executor to initalize RAJA local array -template -struct StatementExecutor, EnclosedStmts...>, Types>{ - - //Execute statement list - template - static void RAJA_INLINE exec_expanded(Data && data) +// Statement executor to initalize RAJA local array +template +struct StatementExecutor, + EnclosedStmts...>, + Types> +{ + + // Execute statement list + template + static void RAJA_INLINE exec_expanded(Data&& data) { execute_statement_list, Types>(data); } - - //Intialize local array - //Identifies type + number of elements needed - template - static void RAJA_INLINE exec_expanded(Data && data) + + // Intialize local array + // Identifies type + number of elements needed + template + static void RAJA_INLINE exec_expanded(Data&& data) { - using varType = typename camp::tuple_element_t::param_tuple_t>::value_type; + using varType = typename camp::tuple_element_t< + Pos, + typename camp::decay::param_tuple_t>::value_type; // Initialize memory #ifdef RAJA_COMPILER_MSVC // MSVC doesn't like taking a pointer to stack allocated data?!?! 
- varType *ptr = new varType[camp::get(data.param_tuple).size()]; + varType* ptr = new varType[camp::get(data.param_tuple).size()]; camp::get(data.param_tuple).set_data(ptr); #else varType Array[camp::get(data.param_tuple).size()]; @@ -95,21 +103,19 @@ struct StatementExecutor - static RAJA_INLINE void exec(Data &&data) + + template + static RAJA_INLINE void exec(Data&& data) { - //Initalize local arrays + execute statements + cleanup + // Initalize local arrays + execute statements + cleanup exec_expanded(data); } - }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Lambda.hpp b/include/RAJA/pattern/kernel/Lambda.hpp index 29d41b431e..69e8bd7f8c 100644 --- a/include/RAJA/pattern/kernel/Lambda.hpp +++ b/include/RAJA/pattern/kernel/Lambda.hpp @@ -46,28 +46,28 @@ struct lambda_arg_param_t struct lambda_arg_offset_t {}; -template +template struct lambda_arg_value_t { - using type = T; + using type = T; }; -template +template struct LambdaArg { - static constexpr camp::idx_t value = V; + static constexpr camp::idx_t value = V; }; -} - +} // namespace internal /*! * Used in RAJA::statement::Lambda to specify that one or more segment values * should be passed into the lambda as an argument */ -template -using Segs = camp::list...>; +template +using Segs = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more segment offsets @@ -79,16 +79,18 @@ using Segs = camp::list... * In the case of tiling (with Tile) the offset is w.r.t. the beginning of the * current tile. */ -template -using Offsets = camp::list...>; +template +using Offsets = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more parameters that * should be passed into the lambda as an argument. */ -template -using Params = camp::list...>; +template +using Params = + camp::list...>; /*! * Used in RAJA::statement::Lambda to specify that one or more constant values @@ -103,8 +105,9 @@ using Params = camp::list> * invokes: lambda0( (double)3, (double) 4 ) */ -template -using ValuesT = camp::list, values>...>; +template +using ValuesT = + camp::list, values>...>; namespace statement @@ -119,24 +122,18 @@ namespace statement * RAJA::kernel(make_tuple{s0, s1, s2}, lambda0, lambda1); * */ -template -struct Lambda : internal::Statement { +template +struct Lambda : internal::Statement +{ static const camp::idx_t loop_body_index = BodyIdx; }; -} // end namespace statement +} // end namespace statement namespace internal { - - - - - - - /* * Helper that extracts a segment value for a lambda argument * @@ -146,26 +143,25 @@ namespace internal * This class allows specialization on the segment type in LoopTypes so that * fancier constructions can happen (ie vector_exec, etc.) 
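InitLocalMem, reformatted above, attaches backing storage to a RAJA::LocalArray held in the parameter tuple (stack storage, or new[] under MSVC as the hunk notes), runs the enclosed statements, then releases it. A sketch following the RAJA local-array tutorial pattern; the LocalArray/ParamList indices and permutation are assumptions for illustration:

#include "RAJA/RAJA.hpp"

void copy_by_tiles(const int* in, int* out, int N)
{
  constexpr int TILE = 8;
  using Tile_t = RAJA::LocalArray<int, RAJA::Perm<0>, RAJA::SizeList<TILE>>;
  Tile_t scratch;  // memory is attached by InitLocalMem at run time

  using POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE>, RAJA::seq_exec,
          RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<1>,
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                  RAJA::statement::Lambda<0>>,     // load the tile
              RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                  RAJA::statement::Lambda<1>>>>>;  // store the tile

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, N)),
      RAJA::make_tuple(static_cast<RAJA::Index_type>(0), scratch),
      [=](RAJA::Index_type i, RAJA::Index_type t, Tile_t& s) { s(t) = in[i]; },
      [=](RAJA::Index_type i, RAJA::Index_type t, Tile_t& s) { out[i] = s(t); });
}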
*/ -template +template struct LambdaSegExtractor { static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Segs<> argument"); + "Segment not " + "assigned, but used " + "in Lambda with " + "Segs<> argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static SegmentType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data) { - return SegmentType(camp::get(data.segment_tuple).begin()[camp::get(data.offset_tuple)]); + return SegmentType(camp::get(data.segment_tuple) + .begin()[camp::get(data.offset_tuple)]); } - }; - /* * Helper that extracts a segment value for a lambda argument * @@ -175,26 +171,24 @@ struct LambdaSegExtractor * This class allows specialization on the segment type in LoopTypes so that * fancier constructions can happen (ie vector_exec, etc.) */ -template +template struct LambdaOffsetExtractor { static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Offsets<> argument"); + "Segment not assigned, " + "but used in Lambda " + "with Offsets<> " + "argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static OffsetType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data) { return OffsetType(camp::get(data.offset_tuple)); } - }; - /* * Helper that provides first level of argument extraction * This acts as a switchboard between Segs, Offsets, and Params @@ -202,129 +196,134 @@ struct LambdaOffsetExtractor * It calls LambdaArgExtractor to perform the actual argument extraction. * This allows LambdaArgExtractor to be specialized */ -template +template struct LambdaArgSwitchboard; -template +template struct LambdaArgSwitchboard> { using OffsetType = camp::at_v; static_assert(!std::is_same::value, - "Offset not assigned, but used in Lambda with Offsets<> argument"); + "Offset not assigned, " + "but used in Lambda " + "with Offsets<> " + "argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static OffsetType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data) { - return LambdaOffsetExtractor::extract(std::forward(data)); + return LambdaOffsetExtractor::extract( + std::forward(data)); } - }; -template +template struct LambdaArgSwitchboard> { using SegmentType = camp::at_v; static_assert(!std::is_same::value, - "Segment not assigned, but used in Lambda with Segs<> argument"); + "Segment not " + "assigned, but used " + "in Lambda with " + "Segs<> argument"); - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static SegmentType extract(Data &&data) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data) { - return LambdaSegExtractor::extract(std::forward(data)); + return LambdaSegExtractor::extract( + std::forward(data)); } - }; -template +template struct LambdaArgSwitchboard> { - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static auto extract(Data &&data)-> - typename std::add_lvalue_reference::param_tuple_t>>::type + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static auto + extract(Data&& data) -> typename std::add_lvalue_reference< + camp::tuple_element_t::param_tuple_t>>::type { return camp::get(data.param_tuple); } }; -template +template struct LambdaArgSwitchboard, value>> { - template - RAJA_HOST_DEVICE - RAJA_INLINE - constexpr - static T extract(Data &&) + template + RAJA_HOST_DEVICE RAJA_INLINE constexpr static T extract(Data&&) { return 
T(value); } }; - RAJA_SUPPRESS_HD_WARN -template -RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(Data &&data, - camp::list const &) +template +RAJA_INLINE RAJA_HOST_DEVICE void +invoke_lambda_with_args(Data&& data, camp::list const&) { camp::get(data.bodies)( LambdaArgSwitchboard::extract(data)...); } - - /*! * A RAJA::kernel statement that invokes a lambda function * with user specified arguments. */ -template -struct StatementExecutor, Types> { +template +struct StatementExecutor, Types> +{ template - static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data) + static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data) { - //Convert SegList, ParamList into Seg, Param types, and store in a list + // Convert SegList, ParamList into Seg, Param types, and store in a list using targList = typename camp::flatten>::type; - invoke_lambda_with_args(std::forward(data), targList{}); + invoke_lambda_with_args(std::forward(data), + targList{}); } }; - -template -RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data &&data, camp::idx_seq const &, camp::idx_seq const &) +template +RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data&& data, + camp::idx_seq const&, + camp::idx_seq const&) { using AllSegs = Segs; using AllParams = Params; // invoke the expanded Lambda executor, passing in all segments and params - StatementExecutor, Types>::exec(std::forward(data)); + StatementExecutor, + Types>::exec(std::forward(data)); } template -struct StatementExecutor, Types> { +struct StatementExecutor, Types> +{ template - static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data) + static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data) { using Data_t = camp::decay; @@ -335,14 +334,13 @@ struct StatementExecutor, Types> { std::forward(data), camp::make_idx_seq_t::value>{}, camp::make_idx_seq_t::value>{}); - } }; -} // namespace internal +} // namespace internal -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Param.hpp b/include/RAJA/pattern/kernel/Param.hpp index 8e870ebe15..60972754e0 100644 --- a/include/RAJA/pattern/kernel/Param.hpp +++ b/include/RAJA/pattern/kernel/Param.hpp @@ -31,10 +31,10 @@ namespace RAJA namespace internal { -struct ParamBase { -}; +struct ParamBase +{}; -}// end namespace internal +} // end namespace internal namespace statement { @@ -47,20 +47,21 @@ namespace statement * RAJA::kernel execution policies. */ template -struct Param : public internal::ParamBase { +struct Param : public internal::ParamBase +{ constexpr static camp::idx_t param_idx = ParamId; template - RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const &data) + RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const& data) -> decltype(camp::get(data.param_tuple)) { return camp::get(data.param_tuple); } }; -} // end namespace statement -} // end namespace RAJA +} // end namespace statement +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Reduce.hpp b/include/RAJA/pattern/kernel/Reduce.hpp index 4de4922ea3..ec1835e75d 100644 --- a/include/RAJA/pattern/kernel/Reduce.hpp +++ b/include/RAJA/pattern/kernel/Reduce.hpp @@ -39,10 +39,12 @@ namespace statement * */ template class ReduceOperator, + template + class ReduceOperator, typename ParamId, typename... 
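The Segs / Offsets / Params / ValuesT aliases reformatted above let a statement::Lambda name exactly which arguments it receives, instead of getting every segment value followed by every parameter. A sketch (names are illustrative):

#include "RAJA/RAJA.hpp"

void axpy_rows(double* y, const double* x, int N, int M, double a)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
              // Pass segment values 1 and 0, in that order, plus parameter 0.
              RAJA::statement::Lambda<0, RAJA::Segs<1, 0>, RAJA::Params<0>>>>>;

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, M),   // segment 0: column j
                       RAJA::RangeSegment(0, N)),  // segment 1: row i
      RAJA::make_tuple(a),
      [=](int i, int j, double coef) { y[i * M + j] += coef * x[i * M + j]; });
}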
EnclosedStmts> -struct Reduce : public internal::Statement { +struct Reduce : public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " @@ -52,10 +54,10 @@ struct Reduce : public internal::Statement { }; -} // end namespace statement +} // end namespace statement -} // end namespace RAJA +} // end namespace RAJA #endif /* RAJA_pattern_kernel_Reduce_HPP */ diff --git a/include/RAJA/pattern/kernel/Region.hpp b/include/RAJA/pattern/kernel/Region.hpp index 82b79ae775..7acb322494 100644 --- a/include/RAJA/pattern/kernel/Region.hpp +++ b/include/RAJA/pattern/kernel/Region.hpp @@ -30,37 +30,38 @@ namespace RAJA namespace statement { -template -struct Region : public internal::Statement { -}; +template +struct Region : public internal::Statement +{}; -} // end namespace statement +} // end namespace statement namespace internal { -//Statement executor to create a region within kernel - -//Note: RAJA region's lambda must capture by reference otherwise -//internal function calls are undefined. -template -struct StatementExecutor, Types> { +// Statement executor to create a region within kernel -template -static RAJA_INLINE void exec(Data &&data) +// Note: RAJA region's lambda must capture by reference otherwise +// internal function calls are undefined. +template +struct StatementExecutor, + Types> { - RAJA::region([&]() { + template + static RAJA_INLINE void exec(Data&& data) + { + + RAJA::region([&]() { using data_t = camp::decay; execute_statement_list, Types>(data_t(data)); }); -} - + } }; -} // namespace internal -} // end namespace RAJA +} // namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index 43f72e0545..86cfcb4345 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -34,14 +34,13 @@ namespace RAJA { -struct TileSize { +struct TileSize +{ const camp::idx_t size; RAJA_HOST_DEVICE RAJA_INLINE - constexpr TileSize(camp::idx_t size_) : size{size_} - { - } + constexpr TileSize(camp::idx_t size_) : size{size_} {} }; namespace statement @@ -56,26 +55,28 @@ template -struct Tile : public internal::Statement { +struct Tile : public internal::Statement +{ using tile_policy_t = TilePolicy; using exec_policy_t = ExecPolicy; }; -} // end namespace statement +} // end namespace statement ///! 
tag for a tiling loop template -struct tile_fixed { +struct tile_fixed +{ static constexpr camp::idx_t chunk_size = chunk_size_; }; template -struct tile_dynamic { +struct tile_dynamic +{ static constexpr camp::idx_t id = ArgumentId; }; - namespace internal { @@ -84,8 +85,12 @@ namespace internal * Assigns the tile segment to segment ArgumentId * */ -template -struct TileWrapper : public GenericWrapper { +template +struct TileWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -104,7 +109,8 @@ struct TileWrapper : public GenericWrapper { template -struct IterableTiler { +struct IterableTiler +{ using value_type = camp::decay; struct iterate @@ -122,16 +128,15 @@ struct IterableTiler { public: using value_type = iterate; using difference_type = camp::idx_t; - using pointer = value_type *; - using reference = value_type &; + using pointer = value_type*; + using reference = value_type&; using iterator_category = std::random_access_iterator_tag; RAJA_HOST_DEVICE RAJA_INLINE - constexpr iterator(IterableTiler const &itiler_, Index_type block_id_) + constexpr iterator(IterableTiler const& itiler_, Index_type block_id_) : itiler{itiler_}, block_id{block_id_} - { - } + {} RAJA_HOST_DEVICE RAJA_INLINE @@ -142,20 +147,20 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE difference_type operator-(const iterator &rhs) const + RAJA_INLINE difference_type operator-(const iterator& rhs) const { return static_cast(block_id) - static_cast(rhs.block_id); } RAJA_HOST_DEVICE - RAJA_INLINE iterator operator-(const difference_type &rhs) const + RAJA_INLINE iterator operator-(const difference_type& rhs) const { return iterator(itiler, block_id - rhs); } RAJA_HOST_DEVICE - RAJA_INLINE iterator operator+(const difference_type &rhs) const + RAJA_INLINE iterator operator+(const difference_type& rhs) const { return iterator(itiler, block_id + rhs >= itiler.num_blocks ? 
itiler.num_blocks @@ -169,13 +174,13 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE bool operator!=(const iterator &rhs) const + RAJA_INLINE bool operator!=(const iterator& rhs) const { return block_id != rhs.block_id; } RAJA_HOST_DEVICE - RAJA_INLINE bool operator<(const iterator &rhs) const + RAJA_INLINE bool operator<(const iterator& rhs) const { return block_id < rhs.block_id; } @@ -183,16 +188,17 @@ struct IterableTiler { RAJA_HOST_DEVICE RAJA_INLINE - IterableTiler(const Iterable &it_, camp::idx_t block_size_) + IterableTiler(const Iterable& it_, camp::idx_t block_size_) : it{it_}, block_size{block_size_} { using std::begin; using std::distance; using std::end; - dist = it.end() - it.begin(); // distance(begin(it), end(it)); + dist = it.end() - it.begin(); // distance(begin(it), end(it)); num_blocks = dist / block_size; // if (dist % block_size) num_blocks += 1; - if (dist - num_blocks * block_size > 0) { + if (dist - num_blocks * block_size > 0) + { num_blocks += 1; } } @@ -222,13 +228,15 @@ template struct StatementExecutor< - statement::Tile, EPol, EnclosedStmts...>, Types> { + statement::Tile, EPol, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = tile_fixed::chunk_size; @@ -238,53 +246,63 @@ struct StatementExecutor< IterableTiler tiled_iterable(segment, chunk_size); // Wrap in case forall_impl needs to thread_privatize - TileWrapper tile_wrapper(data); + TileWrapper tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; } }; -template +template struct StatementExecutor< - statement::Tile, EPol, EnclosedStmts...>, Types> { + statement:: + Tile, EPol, EnclosedStmts...>, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = camp::get(data.param_tuple); - static_assert(camp::concepts::metalib::is_same::value, - "Extracted parameter must be of type TileSize."); + static_assert( + camp::concepts::metalib::is_same::value, + "Extracted parameter must be of type TileSize."); // Create a tile iterator IterableTiler tiled_iterable(segment, chunk_size.size); // Wrap in case forall_impl needs to thread_privatize - TileWrapper tile_wrapper(data); + TileWrapper tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); - + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); + // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; } }; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end 
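The Tile statement and IterableTiler above split a segment into chunk_size pieces and run the enclosed statements once per tile, restoring the original segment afterwards; tile_fixed fixes the chunk size at compile time, while tile_dynamic reads a RAJA::TileSize entry from the parameter tuple at run time. A fixed-size sketch:

#include "RAJA/RAJA.hpp"

void tiled_fill(int* v, int N)
{
  using POL = RAJA::KernelPolicy<
      RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::seq_exec,  // loop over tiles
          RAJA::statement::For<0, RAJA::seq_exec,                     // loop within a tile
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<POL>(RAJA::make_tuple(RAJA::RangeSegment(0, N)),
                    [=](int i) { v[i] = i; });
  // With tile_dynamic, the chunk size would instead be supplied as a
  // RAJA::TileSize value in the kernel_param parameter tuple.
}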
namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp index 2653e992c7..eadf8dc2d2 100644 --- a/include/RAJA/pattern/kernel/TileTCount.hpp +++ b/include/RAJA/pattern/kernel/TileTCount.hpp @@ -47,7 +47,8 @@ template -struct TileTCount : public internal::Statement { +struct TileTCount : public internal::Statement +{ static_assert(std::is_base_of::value, "Inappropriate ParamId, ParamId must be of type " "RAJA::Statement::Param< # >"); @@ -56,7 +57,7 @@ struct TileTCount : public internal::Statement { }; -} // end namespace statement +} // end namespace statement namespace internal { @@ -66,9 +67,13 @@ namespace internal * Assigns the tile segment to segment ArgumentId * Assigns the tile index to param ParamId */ -template -struct TileTCountWrapper : public GenericWrapper { +struct TileTCountWrapper : public GenericWrapper +{ using Base = GenericWrapper; using Base::Base; @@ -79,17 +84,16 @@ struct TileTCountWrapper : public GenericWrapper { // Assign the tile's segment to the tuple camp::get(Base::data.segment_tuple) = si.s; - + // Assign the tile's index Base::data.template assign_param(si.i); - + // Execute enclosed statements Base::exec(); } }; - /*! * A generic RAJA::kernel forall_impl executor for statement::TileTCount * @@ -102,14 +106,16 @@ template struct StatementExecutor< - statement::TileTCount, Types> { + statement::TileTCount, + Types> +{ template - static RAJA_INLINE void exec(Data &data) + static RAJA_INLINE void exec(Data& data) { // Get the segment we are going to tile - auto const &segment = camp::get(data.segment_tuple); + auto const& segment = camp::get(data.segment_tuple); // Get the tiling policies chunk size auto chunk_size = TPol::chunk_size; @@ -119,12 +125,16 @@ struct StatementExecutor< IterableTiler tiled_iterable(segment, chunk_size); // Wrap in case forall_impl needs to thread_privatize - TileTCountWrapper tile_wrapper(data); + TileTCountWrapper + tile_wrapper(data); // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); + forall_impl(r, + EPol{}, + tiled_iterable, + tile_wrapper, + RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; @@ -132,7 +142,7 @@ struct StatementExecutor< }; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp index 9667a55538..3109b9b452 100644 --- a/include/RAJA/pattern/kernel/internal/LoopData.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp @@ -40,29 +40,27 @@ namespace internal { - - - // Universal base of all For wrappers for type traits - struct ForList { - }; - struct ForBase { - }; - struct CollapseBase { - }; - template - struct ForTraitBase : public ForBase { - constexpr static camp::idx_t index_val = ArgumentId; - using index = camp::num; - using index_type = camp::nil; // default to invalid type - using policy_type = Policy; - using type = ForTraitBase; // make camp::value compatible - }; - - +// Universal base of all For wrappers for type traits +struct ForList +{}; +struct ForBase +{}; +struct CollapseBase +{}; +template +struct ForTraitBase : public ForBase +{ + constexpr static 
camp::idx_t index_val = ArgumentId; + using index = camp::num; + using index_type = camp::nil; // default to invalid type + using policy_type = Policy; + using type = ForTraitBase; // make camp::value compatible +}; template -struct iterable_difftype_getter { +struct iterable_difftype_getter +{ using type = typename std::iterator_traits< typename Iterator::iterator>::difference_type; }; @@ -79,7 +77,8 @@ using difftype_tuple_from_segments = template -struct iterable_value_type_getter { +struct iterable_value_type_getter +{ using type = typename std::iterator_traits::value_type; }; @@ -100,13 +99,12 @@ using index_types_from_segments = value_type_list_from_segments>::type; - - template -struct LoopData { +struct LoopData +{ using Self = LoopData; @@ -138,78 +136,70 @@ struct LoopData { using vector_sizes_t = tuple_of_n::value>; vector_sizes_t vector_sizes; - RAJA_INLINE RAJA_HOST_DEVICE constexpr - LoopData(SegmentTuple const &s, ParamTuple const &p, Resource r, Bodies const &... b) + RAJA_INLINE RAJA_HOST_DEVICE constexpr LoopData(SegmentTuple const& s, + ParamTuple const& p, + Resource r, + Bodies const&... b) : segment_tuple(s), param_tuple(p), res(r), bodies(b...) - { - } - constexpr LoopData(LoopData const &) = default; - constexpr LoopData(LoopData &&) = default; + {} + constexpr LoopData(LoopData const&) = default; + constexpr LoopData(LoopData&&) = default; template - RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const &i) + RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const& i) { camp::get(offset_tuple) = i; } template - RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const &i) + RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const& i) { - using param_t = camp::at_v; + using param_t = + camp::at_v; camp::get(param_tuple) = param_t(i); } template - RAJA_HOST_DEVICE RAJA_INLINE - auto get_param() -> - camp::at_v + RAJA_HOST_DEVICE RAJA_INLINE auto get_param() + -> camp::at_v { return camp::get(param_tuple); } - RAJA_HOST_DEVICE RAJA_INLINE - Resource get_resource() - { - return res; - } - - + RAJA_HOST_DEVICE RAJA_INLINE Resource get_resource() { return res; } }; - - template -using segment_diff_type = - typename std::iterator_traits< - typename camp::at_v::iterator>::difference_type; - - +using segment_diff_type = typename std::iterator_traits< + typename camp::at_v::iterator>::difference_type; template -RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data) -> - segment_diff_type +RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const& data) + -> segment_diff_type { return camp::get(data.segment_tuple).end() - camp::get(data.segment_tuple).begin(); } - - template -struct GenericWrapper : GenericWrapperBase { +struct GenericWrapper : GenericWrapperBase +{ using data_t = camp::decay; - data_t &data; + data_t& data; RAJA_INLINE - constexpr explicit GenericWrapper(data_t &d) : data{d} {} + constexpr explicit GenericWrapper(data_t& d) : data{d} {} RAJA_INLINE - void exec() { execute_statement_list, Types>(data); } + void exec() + { + execute_statement_list, Types>(data); + } }; @@ -217,28 +207,27 @@ struct GenericWrapper : GenericWrapperBase { * Convenience object used to create a thread-private LoopData object. 
*/ template -struct NestedPrivatizer { +struct NestedPrivatizer +{ using data_t = typename T::data_t; using value_type = camp::decay; - using reference_type = value_type &; + using reference_type = value_type&; data_t privatized_data; value_type privatized_wrapper; RAJA_INLINE - constexpr NestedPrivatizer(const T &o) + constexpr NestedPrivatizer(const T& o) : privatized_data{o.data}, privatized_wrapper(privatized_data) - { - } + {} RAJA_INLINE reference_type get_priv() { return privatized_wrapper; } }; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp index 7f77df4214..3bfb7b5e9f 100644 --- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp @@ -29,67 +29,77 @@ namespace internal { -template +template struct LoopTypes; -template -struct LoopTypes, camp::list> { +template +struct LoopTypes, camp::list> +{ - using Self = LoopTypes, camp::list>; + using Self = + LoopTypes, camp::list>; static constexpr size_t s_num_segments = sizeof...(SegmentTypes); // This ensures that you don't double-loop over a segment within the same // loop nesting static_assert(s_num_segments == sizeof...(OffsetTypes), - "Number of segments and offsets must match"); + "Number of segments " + "and offsets must " + "match"); using segment_types_t = camp::list; using offset_types_t = camp::list; }; -template -using makeInitialLoopTypes = - LoopTypes::value>, - list_of_n::value>>; +template +using makeInitialLoopTypes = LoopTypes< + list_of_n::value>, + list_of_n::value>>; -template +template struct SetSegmentTypeHelper; -template +template struct SetSegmentTypeHelper> { - using segment_list = typename Types::segment_types_t; - using offset_list = typename Types::offset_types_t; - - static_assert(std::is_same, void>::value, - "Segment was already assigned: Probably looping over same segment in loop nest"); - - using type = LoopTypes< - camp::list>::type...>, - camp::list>::type...>>; - + using segment_list = typename Types::segment_types_t; + using offset_list = typename Types::offset_types_t; + + static_assert(std::is_same, void>::value, + "Segment was already assigned: Probably looping over same " + "segment in loop nest"); + + using type = LoopTypes< + camp::list< + typename std::conditional>::type...>, + camp::list< + typename std::conditional>::type...>>; }; -template -using setSegmentType = - typename SetSegmentTypeHelper>::type; +template +using setSegmentType = typename SetSegmentTypeHelper< + Types, + Segment, + T, + camp::make_idx_seq_t>::type; -template -using setSegmentTypeFromData = - setSegmentType::index_types_t, Segment>>; +template +using setSegmentTypeFromData = setSegmentType< + Types, + Segment, + camp::at_v::index_types_t, Segment>>; -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Statement.hpp b/include/RAJA/pattern/kernel/internal/Statement.hpp index 48ca828a68..8279aac29c 100644 --- a/include/RAJA/pattern/kernel/internal/Statement.hpp +++ b/include/RAJA/pattern/kernel/internal/Statement.hpp @@ -28,11 +28,13 @@ namespace internal { - template -struct Statement { - static_assert(std::is_same::value || sizeof...(EnclosedStmts) > 0, - "Executable 
statement with no enclosed statements, this is almost certainly a bug"); +struct Statement +{ + static_assert(std::is_same::value || + sizeof...(EnclosedStmts) > 0, + "Executable statement with no enclosed statements, this is " + "almost certainly a bug"); Statement() = delete; using enclosed_statements_t = StatementList; @@ -40,15 +42,12 @@ struct Statement { }; - - template struct StatementExecutor; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/StatementList.hpp b/include/RAJA/pattern/kernel/internal/StatementList.hpp index 5c0d71afb4..ac88ffe3cf 100644 --- a/include/RAJA/pattern/kernel/internal/StatementList.hpp +++ b/include/RAJA/pattern/kernel/internal/StatementList.hpp @@ -35,8 +35,6 @@ template struct StatementExecutor; - - template using StatementList = camp::list; @@ -47,11 +45,13 @@ struct StatementListExecutor; template -struct StatementListExecutor { + typename StmtList, + typename Types> +struct StatementListExecutor +{ template - static RAJA_INLINE void exec(Data &&data) + static RAJA_INLINE void exec(Data&& data) { // Get the statement we're going to execute @@ -61,8 +61,10 @@ struct StatementListExecutor { StatementExecutor::exec(std::forward(data)); // call our next statement - StatementListExecutor::exec( - std::forward(data)); + StatementListExecutor::exec(std::forward(data)); } }; @@ -72,26 +74,25 @@ struct StatementListExecutor { */ template -struct StatementListExecutor { +struct StatementListExecutor +{ template - static RAJA_INLINE void exec(Data &&) - { - } + static RAJA_INLINE void exec(Data&&) + {} }; template -RAJA_INLINE void execute_statement_list(Data &&data) +RAJA_INLINE void execute_statement_list(Data&& data) { StatementListExecutor<0, camp::size::value, StmtList, Types>::exec( std::forward(data)); } - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp index c750b95986..c8a980bf97 100644 --- a/include/RAJA/pattern/kernel/internal/Template.hpp +++ b/include/RAJA/pattern/kernel/internal/Template.hpp @@ -39,8 +39,8 @@ struct SeqToType template struct ListOfNHelper; -template -struct ListOfNHelper > +template +struct ListOfNHelper> { using type = camp::list::type...>; }; @@ -49,8 +49,8 @@ struct ListOfNHelper > template struct TupleOfNHelper; -template -struct TupleOfNHelper > +template +struct TupleOfNHelper> { using type = camp::tuple::type...>; }; @@ -64,7 +64,8 @@ struct TupleOfNHelper > * */ template -using list_of_n = typename detail::ListOfNHelper>::type; +using list_of_n = + typename detail::ListOfNHelper>::type; /* @@ -74,12 +75,12 @@ using list_of_n = typename detail::ListOfNHelper>::ty * */ template -using tuple_of_n = typename detail::TupleOfNHelper>::type; +using tuple_of_n = + typename detail::TupleOfNHelper>::type; - -} // end namespace internal -} // end namespace RAJA +} // end namespace internal +} // end namespace RAJA #endif /* RAJA_pattern_kernel_internal_Template_HPP */ diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index b78ec0de92..7c14e08236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -28,7 +28,7 @@ #include "camp/concepts.hpp" #include 
"camp/tuple.hpp" -//Odd dependecy with atomics is breaking CI builds +// Odd dependecy with atomics is breaking CI builds //#include "RAJA/util/View.hpp" #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL) @@ -41,12 +41,17 @@ namespace RAJA { // GPU or CPU threads available -//strongly type the ExecPlace (guards agaist errors) -enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES }; - -struct null_launch_t { +// strongly type the ExecPlace (guards agaist errors) +enum struct ExecPlace : int +{ + HOST, + DEVICE, + NUM_PLACES }; +struct null_launch_t +{}; + // Support for host, and device template -struct LoopPolicy { +struct LoopPolicy +{ using host_policy_t = HOST_POLICY; #if defined(RAJA_GPU_ACTIVE) using device_policy_t = DEVICE_POLICY; @@ -68,7 +74,8 @@ template -struct LaunchPolicy { +struct LaunchPolicy +{ using host_policy_t = HOST_POLICY; #if defined(RAJA_GPU_ACTIVE) using device_policy_t = DEVICE_POLICY; @@ -76,7 +83,8 @@ struct LaunchPolicy { }; -struct Teams { +struct Teams +{ int value[3]; RAJA_INLINE @@ -96,7 +104,8 @@ struct Teams { constexpr Teams(int i, int j, int k) : value{i, j, k} {} }; -struct Threads { +struct Threads +{ int value[3]; RAJA_INLINE @@ -117,7 +126,8 @@ struct Threads { constexpr Threads(int i, int j, int k) : value{i, j, k} {} }; -struct Lanes { +struct Lanes +{ int value; RAJA_INLINE @@ -129,7 +139,8 @@ struct Lanes { constexpr Lanes(int i) : value(i) {} }; -struct LaunchParams { +struct LaunchParams +{ public: Teams teams; Threads threads; @@ -138,67 +149,71 @@ struct LaunchParams { RAJA_INLINE LaunchParams() = default; - LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) - : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {}; + LaunchParams(Teams in_teams, + Threads in_threads, + size_t in_shared_mem_size = 0) + : teams(in_teams), + threads(in_threads), + shared_mem_size(in_shared_mem_size){}; private: RAJA_HOST_DEVICE RAJA_INLINE - Teams apply(Teams const &a) { return (teams = a); } + Teams apply(Teams const& a) { return (teams = a); } RAJA_HOST_DEVICE RAJA_INLINE - Threads apply(Threads const &a) { return (threads = a); } + Threads apply(Threads const& a) { return (threads = a); } }; class LaunchContext { public: - - //Bump style allocator used to - //get memory from the pool + // Bump style allocator used to + // get memory from the pool size_t shared_mem_offset; - void *shared_mem_ptr; + void* shared_mem_ptr; #if defined(RAJA_ENABLE_SYCL) - mutable cl::sycl::nd_item<3> *itm; + mutable cl::sycl::nd_item<3>* itm; #endif RAJA_HOST_DEVICE LaunchContext() - : shared_mem_offset(0), shared_mem_ptr(nullptr) - { - } + : shared_mem_offset(0), shared_mem_ptr(nullptr) + {} - //TODO handle alignment - template + // TODO handle alignment + template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - //Calculate offset in bytes with a char pointer - void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; + // Calculate offset in bytes with a char pointer + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; - shared_mem_offset += bytes*sizeof(T); + shared_mem_offset += bytes * sizeof(T); - //convert to desired type + // convert to desired type return static_cast(mem_ptr); } /* //Odd dependecy with atomics is breaking CI builds - template - RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs) + template RAJA_HOST_DEVICE auto + getSharedMemoryView(size_t bytes, arg idx, args... 
idxs) { T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; shared_mem_offset += bytes*sizeof(T); - return RAJA::View>(mem_ptr, idx, idxs...); + return RAJA::View>(mem_ptr, idx, + idxs...); } */ RAJA_HOST_DEVICE void releaseSharedMemory() { - //On the cpu/gpu we want to restart the count + // On the cpu/gpu we want to restart the count shared_mem_offset = 0; } @@ -218,19 +233,24 @@ class LaunchContext template struct LaunchExecute; -//Policy based launch with support to new reducers... -template -void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) +// Policy based launch with support to new reducers... +template +void launch(LaunchParams const& launch_params, + const char* kernel_name, + ReduceParams&&... rest_of_launch_args) { - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + // Take the first policy as we assume the second policy is not user defined. + // We rely on the user to pair launch and loop policies correctly. + util::PluginContext context{ + util::make_context()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -242,29 +262,36 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa using launch_t = LaunchExecute; - using Res = typename resources::get_resource::type; + using Res = typename resources::get_resource< + typename LAUNCH_POLICY::host_policy_t>::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec( + Res::get_default(), launch_params, kernel_name, p_body, reducers); util::callPostLaunchPlugins(context); } -//Duplicate of code above on account that we need to support the case in which a kernel_name is not given -template -void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) +// Duplicate of code above on account that we need to support the case in which +// a kernel_name is not given +template +void launch(LaunchParams const& launch_params, + ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + // Take the first policy as we assume the second policy is not user defined. + // We rely on the user to pair launch and loop policies correctly. 
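getSharedMemory in the LaunchContext above is a bump allocator over the pool sized by the third LaunchParams argument; note that the requested count is multiplied by sizeof(T), so callers effectively pass a number of elements. A host-only sketch, assuming the active launch backend allocates the pool named in LaunchParams:

#include "RAJA/RAJA.hpp"

void scratch_demo(int N)
{
  using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_pol   = RAJA::LoopPolicy<RAJA::seq_exec>;

  RAJA::launch<launch_pol>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1),
                         2 * N * sizeof(double)),      // shared memory pool size in bytes
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        double* a = ctx.getSharedMemory<double>(N);    // first N doubles from the pool
        double* b = ctx.getSharedMemory<double>(N);    // next N doubles from the pool

        RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N), [&](int i) {
          a[i] = i;
          b[i] = 2.0 * a[i];
        });

        ctx.releaseSharedMemory();                     // reset the bump offset to zero
      });
}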
+ util::PluginContext context{ + util::make_context()}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -276,148 +303,208 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_ using launch_t = LaunchExecute; - using Res = typename resources::get_resource::type; + using Res = typename resources::get_resource< + typename LAUNCH_POLICY::host_policy_t>::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec( + Res::get_default(), launch_params, kernel_name, p_body, reducers); util::callPostLaunchPlugins(context); } //================================================= -//Run time based policy launch +// Run time based policy launch //================================================= template -void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) +void launch(ExecPlace place, LaunchParams const& params, BODY const& body) { launch(place, params, nullptr, body); } template -void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) +void launch(ExecPlace place, + const LaunchParams& params, + const char* kernel_name, + BODY const& body) { - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), params, kernel_name, body); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), params, kernel_name, body); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -//Run-time API for new reducer interface +// Run-time API for new reducer interface template -void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) +void launch(ExecPlace place, + const LaunchParams& launch_params, + const char* kernel_name, + ReduceParams&&... 
rest_of_launch_args) { - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -//Run-time API for new reducer interface with support of the case without a new kernel name +// Run-time API for new reducer interface with support of the case without a new +// kernel name template -void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args) - //BODY const &body) +void launch(ExecPlace place, + const LaunchParams& launch_params, + ReduceParams&&... rest_of_launch_args) +// BODY const &body) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + // Forward to single policy launch API - simplifies testing of plugins + switch (place) + { + case ExecPlace::HOST: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::host_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } + case ExecPlace::DEVICE: + { + using Res = typename resources::get_resource< + typename POLICY_LIST::device_policy_t>::type; + launch>( + Res::get_default(), + launch_params, + kernel_name, + std::forward(rest_of_launch_args)...); + break; + } #endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + default: + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); } - } -// Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) -template -RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ - if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} - else { return RAJA::resources::Resource(host_res); } +// Helper function to 
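The ExecPlace overloads above dispatch a single POLICY_LIST to its host or device half at run time. A sketch with a host-only list (a GPU-enabled build would add a second, device policy to each alias, which ExecPlace::DEVICE would then select; otherwise DEVICE falls through to the abort shown above):

#include "RAJA/RAJA.hpp"

void run(double* x, int N, RAJA::ExecPlace place)
{
  using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_pol   = RAJA::LoopPolicy<RAJA::seq_exec>;

  RAJA::launch<launch_pol>(place,
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      "run_kernel",                                  // optional kernel-name overload
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N),
                             [&](int i) { x[i] += 1.0; });
      });
}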
retrieve a resource based on the run-time policy - if a +// device is active +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || \ + defined(RAJA_ENABLE_SYCL) +template +RAJA::resources::Resource +Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device) +{ + if (device == RAJA::ExecPlace::DEVICE) + { + return RAJA::resources::Resource(device_res); + } + else + { + return RAJA::resources::Resource(host_res); + } } #endif -template -RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ - if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} +template +RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) +{ + if (device == RAJA::ExecPlace::DEVICE) + { + RAJA_ABORT_OR_THROW("Device is not enabled"); + } return RAJA::resources::Resource(host_res); } -//Launch API which takes team resource struct and supports new reducers -template +// Launch API which takes team resource struct and supports new reducers +template resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, - const char *kernel_name, ReduceParams&&... rest_of_launch_args) +launch(RAJA::resources::Resource res, + LaunchParams const& launch_params, + const char* kernel_name, + ReduceParams&&... rest_of_launch_args) { - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { + if (res.get_platform() == RAJA::Platform::host) + { place = RAJA::ExecPlace::HOST; - } else { + } + else + { place = RAJA::ExecPlace::DEVICE; } // - //Configure plugins + // Configure plugins // #if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; + util::PluginContext context{ + place == ExecPlace::HOST + ? 
util::make_context() + : util::make_context()}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{ + util::make_context()}; #endif util::callPreCapturePlugins(context); @@ -429,24 +516,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, util::callPreLaunchPlugins(context); - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + switch (place) + { + case ExecPlace::HOST: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + case ExecPlace::DEVICE: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } + default: + { + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + } } RAJA_ABORT_OR_THROW("Unknown launch place"); @@ -456,36 +549,45 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, } -//Duplicate of API above on account that we need to handle the case that a kernel name is not provided -template +// Duplicate of API above on account that we need to handle the case that a +// kernel name is not provided +template resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, +launch(RAJA::resources::Resource res, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; + const char* kernel_name = nullptr; - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + // Get reducers + auto reducers = expt::make_forall_param_pack( + std::forward(rest_of_launch_args)...); - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); + auto&& launch_body = + expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { + if (res.get_platform() == RAJA::Platform::host) + { place = RAJA::ExecPlace::HOST; - } else { + } + else + { place = RAJA::ExecPlace::DEVICE; } // - //Configure plugins + // Configure plugins // #if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; + util::PluginContext context{ + place == ExecPlace::HOST + ? 
util::make_context() + : util::make_context()}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{ + util::make_context()}; #endif util::callPreCapturePlugins(context); @@ -497,24 +599,30 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, util::callPreLaunchPlugins(context); - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + switch (place) + { + case ExecPlace::HOST: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } + case ExecPlace::DEVICE: + { + using launch_t = LaunchExecute; + resources::EventProxy e_proxy = + launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + util::callPostLaunchPlugins(context); + return e_proxy; + } #endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } + default: + { + RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); + } } RAJA_ABORT_OR_THROW("Unknown launch place"); @@ -523,7 +631,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, return resources::EventProxy(res); } -template +template #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) using loop_policy = typename POLICY_LIST::device_policy_t; #else @@ -541,28 +649,23 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void +loop(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) { - LoopExecute, SEGMENT>::exec(ctx, - segment, - body); + LoopExecute, SEGMENT>::exec(ctx, segment, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void +loop_icount(CONTEXT const& ctx, SEGMENT const& segment, BODY const& body) { - LoopICountExecute, SEGMENT>::exec(ctx, - segment, - body); + LoopICountExecute, SEGMENT>::exec( + ctx, segment, body); } namespace expt @@ -573,16 +676,14 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx, + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - LoopExecute, SEGMENT>::exec(ctx, - segment0, - segment1, - body); + LoopExecute, SEGMENT>::exec( + ctx, segment0, segment1, body); } RAJA_SUPPRESS_HD_WARN @@ -590,18 +691,18 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, - SEGMENT const &segment0, - SEGMENT const &segment1, - SEGMENT const &segment2, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx, + SEGMENT const& segment0, + SEGMENT const& segment1, + SEGMENT const& segment2, + BODY const& body) { - LoopICountExecute, SEGMENT>::exec(ctx, - segment0, segment1, segment2, body); + LoopICountExecute, SEGMENT>::exec( + ctx, segment0, segment1, segment2, body); } -} //namespace expt +} // 
namespace expt template struct TileExecute; @@ -614,16 +715,14 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx, TILE_T tile_size, - SEGMENT const &segment, - BODY const &body) + SEGMENT const& segment, + BODY const& body) { - TileExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + TileExecute, SEGMENT>::exec( + ctx, tile_size, segment, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, - TILE_T tile_size, - SEGMENT const &segment, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx, + TILE_T tile_size, + SEGMENT const& segment, + BODY const& body) { - TileTCountExecute, SEGMENT>::exec(ctx, - tile_size, - segment, - body); + TileTCountExecute, SEGMENT>::exec( + ctx, tile_size, segment, body); } namespace expt @@ -650,20 +747,16 @@ template -RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx, +RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx, TILE_T tile_size0, TILE_T tile_size1, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - TileExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + TileExecute, SEGMENT>::exec( + ctx, tile_size0, tile_size1, segment0, segment1, body); } template -RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx, - TILE_T tile_size0, - TILE_T tile_size1, - SEGMENT const &segment0, - SEGMENT const &segment1, - BODY const &body) +RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx, + TILE_T tile_size0, + TILE_T tile_size1, + SEGMENT const& segment0, + SEGMENT const& segment1, + BODY const& body) { - TileTCountExecute, SEGMENT>::exec(ctx, - tile_size0, - tile_size1, - segment0, - segment1, - body); + TileTCountExecute, SEGMENT>::exec( + ctx, tile_size0, tile_size1, segment0, segment1, body); } -} //namespace expt +} // namespace expt -} // namespace RAJA +} // namespace RAJA #endif diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp index 3fbe36877c..9d3d9dc975 100644 --- a/include/RAJA/pattern/multi_reduce.hpp +++ b/include/RAJA/pattern/multi_reduce.hpp @@ -156,7 +156,7 @@ struct MultiReduceSum; */ template struct MultiReduceBitOr; - + /*! 
****************************************************************************** @@ -171,7 +171,8 @@ struct MultiReduceBitOr; Index_ptr bins = ...; Real_ptr bit_vals = ...; - MultiReduceBitAnd my_bits(num_bins, init_val); + MultiReduceBitAnd my_bits(num_bins, + init_val); forall( ..., [=] (Index_type i) { my_bits[bins[i]] &= (data[i]); @@ -188,7 +189,7 @@ struct MultiReduceBitOr; template struct MultiReduceBitAnd; -} //namespace RAJA +} // namespace RAJA -#endif // closing endif for header file include guard +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index fb854c8706..b1fdfa3b59 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -21,346 +21,440 @@ namespace RAJA namespace expt { - // - // - // Forall Parameter Packing type - // - // - struct ParamMultiplexer; - - template - struct ForallParamPack { - - friend struct ParamMultiplexer; - - using Base = camp::tuple; - Base param_tup; - - static constexpr size_t param_tup_sz = camp::tuple_size::value; - using params_seq = camp::make_idx_seq_t< param_tup_sz >; - - private: - - // Init - template - static constexpr void detail_init(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { - CAMP_EXPAND(expt::detail::init( camp::get(f_params.param_tup), std::forward(args)... )); - } - - // Combine - template - RAJA_HOST_DEVICE - static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& out, const ForallParamPack& in ) { - CAMP_EXPAND(detail::combine( camp::get(out.param_tup), camp::get(in.param_tup))); - } - - template - RAJA_HOST_DEVICE - static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { - CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); - } - - // Resolve - template - static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { - CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup), std::forward(args)... )); - } - - // Used to construct the argument TYPES that will be invoked with the lambda. - template - static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; }; - template - static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); }; - template - static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; - - using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - - //Use the size of param_tup to generate the argument list. - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } - template - RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) { - return camp::tuple_cat_pair( camp::get(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num()) ); - } - - public: - ForallParamPack(){} - - RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num());} - - using lambda_arg_seq = camp::make_idx_seq_t::value>; - - template - ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; - }; // struct ForallParamPack - - - - //=========================================================================== - // - // - // ParamMultiplexer is how we hook into the individual calls within forall_impl. 
- // - // - struct ParamMultiplexer { - template> - static void constexpr init( ForallParamPack& f_params, Args&& ...args) { - FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward(args)... ); - } - template> - static void constexpr combine(ForallParamPack& f_params, Args&& ...args){ - FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); - } - template> - static void constexpr resolve( ForallParamPack& f_params, Args&& ...args){ - FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); - } - }; - //=========================================================================== +// +// +// Forall Parameter Packing type +// +// +struct ParamMultiplexer; + +template +struct ForallParamPack +{ + + friend struct ParamMultiplexer; + using Base = camp::tuple; + Base param_tup; + static constexpr size_t param_tup_sz = camp::tuple_size::value; + using params_seq = camp::make_idx_seq_t; + +private: + // Init + template + static constexpr void detail_init(EXEC_POL, + camp::idx_seq, + ForallParamPack& f_params, + Args&&... args) + { + CAMP_EXPAND(expt::detail::init(camp::get(f_params.param_tup), + std::forward(args)...)); + } + + // Combine + template + RAJA_HOST_DEVICE static constexpr void + detail_combine(EXEC_POL, + camp::idx_seq, + ForallParamPack& out, + const ForallParamPack& in) + { + CAMP_EXPAND(detail::combine(camp::get(out.param_tup), + camp::get(in.param_tup))); + } - //=========================================================================== - // - // - // ForallParamPack generators. - // - // - RAJA_INLINE static auto get_empty_forall_param_pack(){ - static ForallParamPack<> p; - return p; + template + RAJA_HOST_DEVICE static constexpr void + detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params) + { + CAMP_EXPAND(detail::combine(camp::get(f_params.param_tup))); } - namespace detail { - // all_true trick to perform variadic expansion in static asserts. - // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template - template struct bool_pack; - template - using all_true = std::is_same, bool_pack>; + // Resolve + template + static constexpr void detail_resolve(EXEC_POL, + camp::idx_seq, + ForallParamPack& f_params, + Args&&... args) + { + CAMP_EXPAND(detail::resolve(camp::get(f_params.param_tup), + std::forward(args)...)); + } - template - using check_types_derive_base = all_true::value...>; - } // namespace detail + // Used to construct the argument TYPES that will be invoked with the lambda. + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return camp::tuple<>{}; + }; + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return typename First::ARG_TUP_T(); + }; + template + static constexpr auto LAMBDA_ARG_TUP_T() + { + return camp::tuple_cat_pair(typename First::ARG_TUP_T(), + LAMBDA_ARG_TUP_T()); + }; + using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - template - constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) { - static_assert(detail::check_types_derive_base...>::value, - "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ; - return ForallParamPack...>(std::move(tuple)); + // Use the size of param_tup to generate the argument list. 
+ RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) + { + return camp::make_tuple(); + } + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) + { + return camp::get(param_tup).get_lambda_arg_tup(); + } + template + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) + { + return camp::tuple_cat_pair( + camp::get(param_tup).get_lambda_arg_tup(), + LAMBDA_ARG_TUP_V(camp::num())); } - +public: + ForallParamPack() {} - namespace detail { - // Maybe we should do a lot of these with structs... - template - constexpr auto tuple_from_seq (const camp::idx_seq&, TupleType&& tuple){ - return camp::forward_as_tuple( camp::get< Seq >(std::forward(tuple))... ); - }; + RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() + { + return LAMBDA_ARG_TUP_V(camp::num()); + } - template - constexpr auto strip_last_elem(camp::tuple&& tuple){ - return tuple_from_seq(camp::make_idx_seq_t{},std::move(tuple)); - }; - } // namespace detail + using lambda_arg_seq = + camp::make_idx_seq_t::value>; + template + ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)){}; +}; // struct ForallParamPack - // Make a tuple of the param pack except the final element... - template - constexpr auto make_forall_param_pack(Args&&... args){ - // We assume the last element of the pack is the lambda so we need to strip it from the list. - auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); - return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); + +//=========================================================================== +// +// +// ParamMultiplexer is how we hook into the individual calls within forall_impl. +// +// +struct ParamMultiplexer +{ + template > + static void constexpr init(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_init(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); + } + template > + static void constexpr combine(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_combine(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); + } + template > + static void constexpr resolve(ForallParamPack& f_params, + Args&&... args) + { + FP::detail_resolve(EXEC_POL(), + typename FP::params_seq(), + f_params, + std::forward(args)...); } - //=========================================================================== - - - - //=========================================================================== - // - // - // Callable should be the last argument in the param pack, just extract it... - // - // - template - constexpr auto&& get_lambda(Args&&... args){ - return camp::get( camp::forward_as_tuple(std::forward(args)...) ); - } - //=========================================================================== - - - - //=========================================================================== - // - // - // Checking expected argument list against the assumed lambda. 
- // - // - namespace detail { - - // - // - // Lambda traits Utilities - // - // - template - struct lambda_traits; - - template - struct lambda_traits - { // non-const specialization - using arg_type = First; - }; - template - struct lambda_traits - { // const specialization - using arg_type = First; - }; - - template - typename lambda_traits::arg_type* lambda_arg_helper(T); - - - // - // - // List manipulation Utilities - // - // - template - constexpr auto list_remove_pointer(const camp::list&){ - return camp::list::type>...>{}; - } - - template - constexpr auto list_add_lvalue_ref(const camp::list&){ - return camp::list::type...>{}; - } - - template - constexpr auto tuple_to_list(const camp::tuple&) { - return camp::list{}; - } - - // TODO : Change to std::is_invocable at c++17 - template - struct is_invocable : - std::is_constructible< - std::function, - std::reference_wrapper::type> - >{}; - - template - using void_t = void; - - template - struct has_empty_op : std::false_type{}; - - template - struct has_empty_op)>> : std::true_type{}; - - template - struct get_lambda_index_type { - typedef typename std::remove_pointer< - decltype(lambda_arg_helper( - &camp::decay::operator()) - ) - >::type type; - }; - - // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args. - template - constexpr concepts::enable_if>> check_invocable(LAMBDA&&, const camp::list&) {} - - template - constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { +}; +//=========================================================================== + + +//=========================================================================== +// +// +// ForallParamPack generators. +// +// +RAJA_INLINE static auto get_empty_forall_param_pack() +{ + static ForallParamPack<> p; + return p; +} + +namespace detail +{ +// all_true trick to perform variadic expansion in static asserts. +// https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template +template +struct bool_pack; +template +using all_true = std::is_same, bool_pack>; + +template +using check_types_derive_base = + all_true::value...>; +} // namespace detail + + +template +constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) +{ + static_assert(detail::check_types_derive_base...>::value, + "Forall optional arguments do not derive ForallParamBase. " + "Please see Reducer, ReducerLoc and KernelName for examples."); + return ForallParamPack...>(std::move(tuple)); +} + + +namespace detail +{ +// Maybe we should do a lot of these with structs... +template +constexpr auto tuple_from_seq(const camp::idx_seq&, TupleType&& tuple) +{ + return camp::forward_as_tuple( + camp::get(std::forward(tuple))...); +}; + +template +constexpr auto strip_last_elem(camp::tuple&& tuple) +{ + return tuple_from_seq(camp::make_idx_seq_t{}, + std::move(tuple)); +}; +} // namespace detail + + +// Make a tuple of the param pack except the final element... +template +constexpr auto make_forall_param_pack(Args&&... args) +{ + // We assume the last element of the pack is the lambda so we need to strip it + // from the list. 
+ auto stripped_arg_tuple = detail::strip_last_elem( + camp::forward_as_tuple(std::forward(args)...)); + return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); +} +//=========================================================================== + + +//=========================================================================== +// +// +// Callable should be the last argument in the param pack, just extract it... +// +// +template +constexpr auto&& get_lambda(Args&&... args) +{ + return camp::get( + camp::forward_as_tuple(std::forward(args)...)); +} +//=========================================================================== + + +//=========================================================================== +// +// +// Checking expected argument list against the assumed lambda. +// +// +namespace detail +{ + +// +// +// Lambda traits Utilities +// +// +template +struct lambda_traits; + +template +struct lambda_traits +{ // non-const specialization + using arg_type = First; +}; +template +struct lambda_traits +{ // const specialization + using arg_type = First; +}; + +template +typename lambda_traits::arg_type* lambda_arg_helper(T); + + +// +// +// List manipulation Utilities +// +// +template +constexpr auto list_remove_pointer(const camp::list&) +{ + return camp::list::type>...>{}; +} + +template +constexpr auto list_add_lvalue_ref(const camp::list&) +{ + return camp::list::type...>{}; +} + +template +constexpr auto tuple_to_list(const camp::tuple&) +{ + return camp::list{}; +} + +// TODO : Change to std::is_invocable at c++17 +template +struct is_invocable + : std::is_constructible< + std::function, + std::reference_wrapper::type>> +{}; + +template +using void_t = void; + +template +struct has_empty_op : std::false_type +{}; + +template +struct has_empty_op)>> + : std::true_type +{}; + +template +struct get_lambda_index_type +{ + typedef typename std::remove_pointer::operator()))>::type type; +}; + +// If LAMBDA::operator() is not available this probably isn't a generic lambda +// and we can't extract and check args. +template +constexpr concepts::enable_if>> +check_invocable(LAMBDA&&, const camp::list&) +{} + +template +constexpr concepts::enable_if> +check_invocable(LAMBDA&&, const camp::list&) +{ #if !defined(RAJA_ENABLE_HIP) - static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS."); + static_assert(is_invocable::type, + EXPECTED_ARGS...>::value, + "LAMBDA Not invocable w/ EXPECTED_ARGS."); #endif - } - - } // namespace detail +} +} // namespace detail - template - constexpr - void - check_forall_optional_args(Lambda&& l, ForallParams& fpp) { - using expected_arg_type_list = decltype( detail::list_add_lvalue_ref( - detail::list_remove_pointer( - detail::tuple_to_list( - fpp.lambda_args() - ) - ) - )); +template +constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp) +{ - detail::check_invocable(std::forward(l), expected_arg_type_list{}); - } - //=========================================================================== - + using expected_arg_type_list = decltype(detail::list_add_lvalue_ref( + detail::list_remove_pointer(detail::tuple_to_list(fpp.lambda_args())))); + detail::check_invocable(std::forward(l), expected_arg_type_list{}); +} +//=========================================================================== - //=========================================================================== - // - // - // Type trailts for SFINAE work. 
- // - // - namespace type_traits - { - template struct is_ForallParamPack : std::false_type {}; - template struct is_ForallParamPack> : std::true_type {}; - template struct is_ForallParamPack_empty : std::true_type {}; - template struct is_ForallParamPack_empty> : std::false_type {}; - template <> struct is_ForallParamPack_empty> : std::true_type {}; - } - //=========================================================================== - - - - //=========================================================================== - // - // - // Invoke Forall with Params. - // - // - namespace detail { - template - RAJA_HOST_DEVICE - constexpr - auto get_lambda_args(FP& fpp) - -> decltype( *camp::get( fpp.lambda_args() ) ) { - return ( *camp::get( fpp.lambda_args() ) ); - } - - CAMP_SUPPRESS_HD_WARN - template - RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, - Fn&& f, - camp::idx_seq, - Ts&&... extra) - { - return f(std::forward(extra...), ( get_lambda_args(params) )...); - } - } // namespace detail - - //CAMP_SUPPRESS_HD_WARN - template - RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra) - { - return detail::invoke_with_order( - camp::forward(params), - camp::forward(f), - typename camp::decay::lambda_arg_seq(), - camp::forward(extra)...); - } - //=========================================================================== +//=========================================================================== +// +// +// Type trailts for SFINAE work. +// +// +namespace type_traits +{ +template +struct is_ForallParamPack : std::false_type +{}; +template +struct is_ForallParamPack> : std::true_type +{}; + +template +struct is_ForallParamPack_empty : std::true_type +{}; +template +struct is_ForallParamPack_empty> + : std::false_type +{}; +template <> +struct is_ForallParamPack_empty> : std::true_type +{}; +} // namespace type_traits +//=========================================================================== + + +//=========================================================================== +// +// +// Invoke Forall with Params. +// +// +namespace detail +{ +template +RAJA_HOST_DEVICE constexpr auto get_lambda_args(FP& fpp) + -> decltype(*camp::get(fpp.lambda_args())) +{ + return (*camp::get(fpp.lambda_args())); +} + +CAMP_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, + Fn&& f, + camp::idx_seq, + Ts&&... extra) +{ + return f(std::forward(extra...), + (get_lambda_args(params))...); +} +} // namespace detail + +// CAMP_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE constexpr auto +invoke_body(Params&& params, Fn&& f, Ts&&... 
extra) +{ + return detail::invoke_with_order( + camp::forward(params), + camp::forward(f), + typename camp::decay::lambda_arg_seq(), + camp::forward(extra)...); +} +//=========================================================================== } // namespace expt } // namespace RAJA diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp index e768d8dd59..f3a517fbac 100644 --- a/include/RAJA/pattern/params/kernel_name.hpp +++ b/include/RAJA/pattern/params/kernel_name.hpp @@ -10,23 +10,20 @@ namespace expt namespace detail { - struct KernelName : public ForallParamBase { - RAJA_HOST_DEVICE KernelName() {} - KernelName(const char* name_in) : name(name_in) {} - const char* name; - }; +struct KernelName : public ForallParamBase +{ + RAJA_HOST_DEVICE KernelName() {} + KernelName(const char* name_in) : name(name_in) {} + const char* name; +}; } // namespace detail -inline auto KernelName(const char * n) -{ - return detail::KernelName(n); -} +inline auto KernelName(const char* n) { return detail::KernelName(n); } } // namespace expt } // namespace RAJA - #endif // KERNEL_NAME_HPP diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp index 51e96260f8..78b14f907a 100644 --- a/include/RAJA/pattern/params/params_base.hpp +++ b/include/RAJA/pattern/params/params_base.hpp @@ -9,16 +9,17 @@ namespace expt namespace detail { - struct ForallParamBase { - - // Some of this can be made virtual in c++20, for now must be defined in each child class - // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.) - using ARG_TUP_T = camp::tuple<>; - using ARG_LIST_T = typename ARG_TUP_T::TList; - RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } - static constexpr size_t num_lambda_args = camp::tuple_size::value; - - }; +struct ForallParamBase +{ + + // Some of this can be made virtual in c++20, for now must be defined in each + // child class if any arguments to the forall lambda are needed (e.g. + // KernelName is excluded.) 
+ using ARG_TUP_T = camp::tuple<>; + using ARG_LIST_T = typename ARG_TUP_T::TList; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } + static constexpr size_t num_lambda_args = camp::tuple_size::value; +}; } // namespace detail diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp index 05103c7ad4..e6c4c737a1 100644 --- a/include/RAJA/pattern/params/reducer.hpp +++ b/include/RAJA/pattern/params/reducer.hpp @@ -18,8 +18,9 @@ namespace RAJA namespace expt { -template -struct ValLoc { +template +struct ValLoc +{ using index_type = RAJA::Index_type; using value_type = T; @@ -27,14 +28,28 @@ struct ValLoc { RAJA_HOST_DEVICE ValLoc(value_type v) : val(v) {} RAJA_HOST_DEVICE ValLoc(value_type v, RAJA::Index_type l) : val(v), loc(l) {} - RAJA_HOST_DEVICE void min(value_type v, index_type l) { if (v < val) { val = v; loc = l; } } - RAJA_HOST_DEVICE void max(value_type v, index_type l) { if (v > val) { val = v; loc = l; } } + RAJA_HOST_DEVICE void min(value_type v, index_type l) + { + if (v < val) + { + val = v; + loc = l; + } + } + RAJA_HOST_DEVICE void max(value_type v, index_type l) + { + if (v > val) + { + val = v; + loc = l; + } + } bool constexpr operator<(const ValLoc& rhs) const { return val < rhs.val; } bool constexpr operator>(const ValLoc& rhs) const { return val > rhs.val; } - value_type getVal() {return val;} - RAJA::Index_type getLoc() {return loc;} + value_type getVal() { return val; } + RAJA::Index_type getLoc() { return loc; } private: value_type val; @@ -47,7 +62,8 @@ namespace operators { template -struct limits> { +struct limits> +{ RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc min() { return RAJA::expt::ValLoc(RAJA::operators::limits::min()); @@ -71,75 +87,81 @@ namespace detail { #if defined(RAJA_CUDA_ACTIVE) - using device_mem_pool_t = RAJA::cuda::device_mempool_type; +using device_mem_pool_t = RAJA::cuda::device_mempool_type; #elif defined(RAJA_HIP_ACTIVE) - using device_mem_pool_t = RAJA::hip::device_mempool_type; +using device_mem_pool_t = RAJA::hip::device_mempool_type; #elif defined(RAJA_SYCL_ACTIVE) - using device_mem_pool_t = RAJA::sycl::device_mempool_type; +using device_mem_pool_t = RAJA::sycl::device_mempool_type; #endif - // - // - // Basic Reducer - // - // - template - struct Reducer : public ForallParamBase { - using op = Op; - using value_type = T; - - RAJA_HOST_DEVICE Reducer() {} - Reducer(value_type *target_in) : target(target_in), val(op::identity()) {} - - value_type *target = nullptr; - value_type val = op::identity(); - -#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE) - // Device related attributes. - value_type * devicetarget = nullptr; - RAJA::detail::SoAPtr device_mem; - unsigned int * device_count = nullptr; +// +// +// Basic Reducer +// +// +template +struct Reducer : public ForallParamBase +{ + using op = Op; + using value_type = T; + + RAJA_HOST_DEVICE Reducer() {} + Reducer(value_type* target_in) : target(target_in), val(op::identity()) {} + + value_type* target = nullptr; + value_type val = op::identity(); + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || \ + defined(RAJA_SYCL_ACTIVE) + // Device related attributes. 
+ value_type* devicetarget = nullptr; + RAJA::detail::SoAPtr device_mem; + unsigned int* device_count = nullptr; #endif - using ARG_TUP_T = camp::tuple; - RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); } + using ARG_TUP_T = camp::tuple; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() + { + return camp::make_tuple(&val); + } - using ARG_LIST_T = typename ARG_TUP_T::TList; - static constexpr size_t num_lambda_args = camp::tuple_size::value ; - }; + using ARG_LIST_T = typename ARG_TUP_T::TList; + static constexpr size_t num_lambda_args = camp::tuple_size::value; +}; } // namespace detail template