diff --git a/.clang-format b/.clang-format index b6fa54b233..b50c1facfb 100644 --- a/.clang-format +++ b/.clang-format @@ -31,7 +31,7 @@ BraceWrapping: AfterExternBlock: false BeforeCatch: true BeforeElse: true - BeforeLambdaBody: true + BeforeLambdaBody: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false @@ -40,12 +40,13 @@ BraceWrapping: # Pointer alignment DerivePointerAlignment: false PointerAlignment: Left + +# Single line config AllowShortIfStatementsOnASingleLine : true AllowShortFunctionsOnASingleLine : true AllowShortLoopsOnASingleLine : false AllowAllArgumentsOnNextLine : true AllowAllParametersOfDeclarationOnNextLine : false -AlignTrailingComments : true BinPackArguments : true BinPackParameters : false ConstructorInitializerAllOnOneLineOrOnePerLine : true diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake index 8a19001cc7..11c4661cc1 100644 --- a/cmake/RAJAMacros.cmake +++ b/cmake/RAJAMacros.cmake @@ -209,7 +209,7 @@ endmacro(raja_add_benchmark) ## raja_add_code_checks() ## ## Adds code checks for all source files recursively in the RAJA repository. -## +## ## This creates the following parent build targets: ## check - Runs a non file changing style check and CppCheck ## style - In-place code formatting @@ -233,7 +233,7 @@ macro(raja_add_code_checks) if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") # Create file globbing expressions that only include directories that contain source # TODO(bowen) Add examples, exercises and benchmark to the list below - set(_base_dirs "RAJA" "examples" "exercises" "benchmark" "include" "src" "test") + set(_base_dirs "RAJA" "benchmark" "include" "src" "test") set(_ext_expressions "*.cpp" "*.hpp" "*.inl" "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh") @@ -248,15 +248,6 @@ macro(raja_add_code_checks) set(_sources) file(GLOB_RECURSE _sources ${_glob_expressions}) - # Filter out exclusions - #set(_exclude_expressions - # "${PROJECT_SOURCE_DIR}/axom/sidre/examples/lulesh2/*" - # "${PROJECT_SOURCE_DIR}/axom/slam/examples/lulesh2.0.3/*" - # "${PROJECT_SOURCE_DIR}/axom/slam/examples/tinyHydro/*") - #foreach(_exp ${_exclude_expressions}) - # list(FILTER _sources EXCLUDE REGEX ${_exp}) - #endforeach() -# blt_add_code_checks(PREFIX RAJA SOURCES ${_sources} CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format @@ -265,7 +256,7 @@ macro(raja_add_code_checks) # Set FOLDER property for code check targets foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style) set(_tgt ${arg_PREFIX}_${_suffix}) - if(TARGET ${_tgt}) + if(TARGET ${_tgt}) set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks") endif() endforeach() diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 8bb722a797..453dc75a1c 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -29,7 +29,7 @@ #include "camp/tuple.hpp" // Odd dependecy with atomics is breaking CI builds -//#include "RAJA/util/View.hpp" +// #include "RAJA/util/View.hpp" #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL) #define RAJA_TEAM_SHARED __shared__ diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp index 8f152d92ce..c384465a15 100644 --- a/include/RAJA/pattern/tensor/TensorIndex.hpp +++ b/include/RAJA/pattern/tensor/TensorIndex.hpp @@ -29,196 +29,190 @@ namespace expt { - template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> - struct 
StaticTensorIndexInner; - - template - struct StaticTensorIndex; - - - template - class TensorIndex { - public: - using self_type = TensorIndex; - using value_type = strip_index_type_t; - using index_type = IDX; - using tensor_type = TENSOR_TYPE; - - RAJA_INLINE - RAJA_HOST_DEVICE - static - constexpr - self_type all(){ - return self_type(index_type(-1), value_type(-1)); - } - - RAJA_INLINE - RAJA_HOST_DEVICE - static - constexpr - StaticTensorIndex> static_all(){ - return StaticTensorIndex>(); - } - - RAJA_INLINE - RAJA_HOST_DEVICE - static - constexpr - self_type range(index_type begin, index_type end){ - return self_type(begin, value_type(stripIndexType(end-begin))); - } - - template - RAJA_INLINE - RAJA_HOST_DEVICE - static - constexpr - StaticTensorIndex> static_range(){ - return StaticTensorIndex>(); - } - - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - TensorIndex() : m_index(index_type(0)), m_length(0) {} - - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - TensorIndex(RAJA::TypedRangeSegment const &seg) : - m_index(*seg.begin()), m_length(seg.size()) - {} - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - TensorIndex(index_type value, value_type length) : m_index(value), m_length(length) {} - - template - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - TensorIndex(TensorIndex const &c) : m_index(*c), m_length(c.size()) {} - - - template - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - TensorIndex(StaticTensorIndex> const RAJA_UNUSED_ARG(&c)) - : m_index(IDX_VAL) - , m_length(LEN_VAL) - {} - - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - index_type const &operator*() const { - return m_index; - } - - // used in strip_by_value as a static cast - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - explicit operator index_type() const { - // return does not matter, but suppresses no-return warnings - return m_index; - } - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - index_type begin() const { - return m_index; - } - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - value_type size() const { - return m_length; - } - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr - value_type dim() const { - return DIM; - } - - private: - index_type m_index; - value_type m_length; - }; - - - template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> - struct StaticTensorIndex> { - - using base_type = TensorIndex; - using value_type = strip_index_type_t; - using index_type = IDX; - using tensor_type = TENSOR_TYPE; - - static const index_type s_index = INDEX_VALUE; - static const index_type s_length = LENGTH_VALUE; - - RAJA_INLINE - RAJA_HOST_DEVICE - constexpr operator base_type() { - return base_type(s_index,s_length); - } - - }; - - - - /*! - * Index that specifies the starting element index of a Vector - */ - template - using VectorIndex = TensorIndex; - - /*! - * Index that specifies the starting Row index of a matrix - */ - template - using RowIndex = TensorIndex; - - /*! - * Index that specifies the starting Column index of a matrix - */ - template - using ColIndex = TensorIndex; - - - /*! 
- * Converts a Row index to a Column index - */ - template +template INDEX_VALUE, + strip_index_type_t LENGTH_VALUE> +struct StaticTensorIndexInner; + +template +struct StaticTensorIndex; + + +template +class TensorIndex +{ +public: + using self_type = TensorIndex; + using value_type = strip_index_type_t; + using index_type = IDX; + using tensor_type = TENSOR_TYPE; + + RAJA_INLINE + RAJA_HOST_DEVICE + static constexpr self_type all() + { + return self_type(index_type(-1), value_type(-1)); + } + + RAJA_INLINE RAJA_HOST_DEVICE + static constexpr StaticTensorIndex> + static_all() + { + return StaticTensorIndex>(); + } + RAJA_INLINE - constexpr - ColIndex toColIndex(RowIndex const &r){ - return ColIndex(*r, r.size()); + RAJA_HOST_DEVICE + static constexpr self_type range(index_type begin, index_type end) + { + return self_type(begin, value_type(stripIndexType(end - begin))); } - /*! - * Converts a Column index to a Row index - */ - template + template + RAJA_INLINE RAJA_HOST_DEVICE static constexpr StaticTensorIndex< + StaticTensorIndexInner> + static_range() + { + return StaticTensorIndex< + StaticTensorIndexInner>(); + } + + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr TensorIndex() : m_index(index_type(0)), m_length(0) {} + + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr TensorIndex(RAJA::TypedRangeSegment const& seg) + : m_index(*seg.begin()), m_length(seg.size()) + {} + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr TensorIndex(index_type value, value_type length) + : m_index(value), m_length(length) + {} + + template + RAJA_INLINE + RAJA_HOST_DEVICE constexpr TensorIndex(TensorIndex const& c) + : m_index(*c), m_length(c.size()) + {} + + + template + RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex( + StaticTensorIndex< + StaticTensorIndexInner> const + RAJA_UNUSED_ARG(&c)) + : m_index(IDX_VAL), m_length(LEN_VAL) + {} + + + RAJA_INLINE RAJA_HOST_DEVICE + constexpr index_type const& operator*() const { return m_index; } + + // used in strip_by_value as a static cast RAJA_INLINE - constexpr - RowIndex toRowIndex(ColIndex const &c){ - return RowIndex(*c, c.size()); + RAJA_HOST_DEVICE + constexpr explicit operator index_type() const + { + // return does not matter, but suppresses no-return warnings + return m_index; } -} // namespace expt + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr index_type begin() const { return m_index; } + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr value_type size() const { return m_length; } + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr value_type dim() const { return DIM; } + +private: + index_type m_index; + value_type m_length; +}; + + +template INDEX_VALUE, + strip_index_type_t LENGTH_VALUE> +struct StaticTensorIndex< + StaticTensorIndexInner> +{ + + using base_type = TensorIndex; + using value_type = strip_index_type_t; + using index_type = IDX; + using tensor_type = TENSOR_TYPE; + + static const index_type s_index = INDEX_VALUE; + static const index_type s_length = LENGTH_VALUE; + + RAJA_INLINE + RAJA_HOST_DEVICE + constexpr operator base_type() { return base_type(s_index, s_length); } +}; + + +/*! + * Index that specifies the starting element index of a Vector + */ +template +using VectorIndex = TensorIndex; + +/*! + * Index that specifies the starting Row index of a matrix + */ +template +using RowIndex = TensorIndex; + +/*! + * Index that specifies the starting Column index of a matrix + */ +template +using ColIndex = TensorIndex; + + +/*! 
+ * Converts a Row index to a Column index + */ +template +RAJA_HOST_DEVICE RAJA_INLINE constexpr ColIndex +toColIndex(RowIndex const& r) +{ + return ColIndex(*r, r.size()); +} + +/*! + * Converts a Column index to a Row index + */ +template +RAJA_HOST_DEVICE RAJA_INLINE constexpr RowIndex +toRowIndex(ColIndex const& c) +{ + return RowIndex(*c, c.size()); +} + +} // namespace expt } // namespace RAJA #include "RAJA/pattern/tensor/internal/TensorIndexTraits.hpp" diff --git a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp index 0c57f20067..50ae0933c0 100644 --- a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp +++ b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp @@ -28,7 +28,7 @@ #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp" -//#define RAJA_DEBUG_PRINT_ET_AST +// #define RAJA_DEBUG_PRINT_ET_AST namespace RAJA { diff --git a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp index c89f887ca5..6ea5d09aa9 100644 --- a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp +++ b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp @@ -1091,13 +1091,13 @@ struct MultiplyOperator< * Evaluate operands and perform element-wise multiply */ template - RAJA_INLINE RAJA_HOST_DEVICE static block_literal - multiply(TILE_TYPE const& tile, - LEFT_OPERAND_TYPE const&, - RIGHT_OPERAND_TYPE const&) //-> - /// decltype(TensorMultiply(left.eval(tile), - /// right.eval(tile))) + RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply( + TILE_TYPE const& tile, + LEFT_OPERAND_TYPE const&, + RIGHT_OPERAND_TYPE const&) //-> + /// decltype(TensorMultiply(left.eval(tile), + /// right.eval(tile))) { /* @@ -1126,15 +1126,16 @@ struct MultiplyOperator< } template - RAJA_INLINE RAJA_HOST_DEVICE static block_literal - multiply_add(TILE_TYPE const& tile, - LEFT_OPERAND_TYPE const& left, - RIGHT_OPERAND_TYPE const& right, - ADD_TYPE const& add) //-> - // decltype(TensorMultiplyAdd(left.eval(tile), - // right.eval(tile), add.eval(tile))) + RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply_add( + TILE_TYPE const& tile, + LEFT_OPERAND_TYPE const& left, + RIGHT_OPERAND_TYPE const& right, + ADD_TYPE const& + add) //-> + // decltype(TensorMultiplyAdd(left.eval(tile), + // right.eval(tile), add.eval(tile))) { /* * First pass: we want to return a BlockLiteral ET node with the diff --git a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp index 2b87f1d34d..3134421735 100644 --- a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp +++ b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp @@ -24,7 +24,7 @@ #include "RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp" #include "RAJA/util/BitMask.hpp" -//#define DEBUG_MATRIX_LOAD_STORE +// #define DEBUG_MATRIX_LOAD_STORE namespace RAJA diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp index e43bd71386..742aaa25b8 100644 --- a/include/RAJA/policy/atomic_builtin.hpp +++ b/include/RAJA/policy/atomic_builtin.hpp @@ -22,7 +22,8 @@ #include -#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER)) +#if defined(RAJA_COMPILER_MSVC) || \ + ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER)) #include #endif @@ -41,14 +42,16 @@ namespace RAJA //! 
Atomic policy that uses the compilers builtin __atomic_XXX routines
-struct builtin_atomic {
-};
+struct builtin_atomic
+{};


-namespace detail {
+namespace detail
+{


-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) || \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))


 /*!
@@ -56,12 +59,11 @@ namespace detail {
  * using an intrinsic
  */
 template <typename T>
-struct builtin_useIntrinsic {
+struct builtin_useIntrinsic
+{
   static constexpr bool value =
-      std::is_same<T, char>::value ||
-      std::is_same<T, short>::value ||
-      std::is_same<T, long>::value ||
-      std::is_same<T, long long>::value;
+      std::is_same<T, char>::value || std::is_same<T, short>::value ||
+      std::is_same<T, long>::value || std::is_same<T, long long>::value;
 };

@@ -70,18 +72,18 @@ struct builtin_useIntrinsic {
  * by reinterpreting inputs to types that intrinsics support
  */
 template <typename T>
-struct builtin_useReinterpret {
+struct builtin_useReinterpret
+{
   static constexpr bool value =
-      !builtin_useIntrinsic<T>::value &&
-      (sizeof(T) == 1 ||
-       sizeof(T) == 2 ||
-       sizeof(T) == 4 ||
-       sizeof(T) == 8);
-
-  using type =
-      std::conditional_t<sizeof(T) == 1, char, std::conditional_t<sizeof(T) == 2, short, std::conditional_t<sizeof(T) == 4, long, long long>>>;
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+
+  using type = std::conditional_t<
+      sizeof(T) == 1,
+      char,
+      std::conditional_t<sizeof(T) == 2,
+                         short,
+                         std::conditional_t<sizeof(T) == 4, long, long long>>>;
 };

@@ -90,10 +92,11 @@ struct builtin_useReinterpret {
  * using a compare and swap loop
  */
 template <typename T>
-struct builtin_useCAS {
+struct builtin_useCAS
+{
   static constexpr bool value =
-      !builtin_useIntrinsic<T>::value &&
-      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };

@@ -105,24 +108,24 @@ struct builtin_useCAS {
 /*!
  * Atomic or using intrinsics
  */
-RAJA_INLINE char builtin_atomicOr(char *acc, char value)
+RAJA_INLINE char builtin_atomicOr(char* acc, char value)
 {
   return _InterlockedOr8(acc, value);
 }

-RAJA_INLINE short builtin_atomicOr(short *acc, short value)
+RAJA_INLINE short builtin_atomicOr(short* acc, short value)
 {
   return _InterlockedOr16(acc, value);
 }

-RAJA_INLINE long builtin_atomicOr(long *acc, long value)
+RAJA_INLINE long builtin_atomicOr(long* acc, long value)
 {
   return _InterlockedOr(acc, value);
 }

 #if defined(_WIN64)

-RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicOr(long long* acc, long long value)
 {
   return _InterlockedOr64(acc, value);
 }

@@ -134,7 +137,7 @@ RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_INLINE T builtin_atomicLoad(T *acc)
+RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   return builtin_atomicOr(acc, static_cast<T>(0));
 }

@@ -143,24 +146,24 @@
 /*!
* Atomic exchange using intrinsics */ -RAJA_INLINE char builtin_atomicExchange(char *acc, char value) +RAJA_INLINE char builtin_atomicExchange(char* acc, char value) { return _InterlockedExchange8(acc, value); } -RAJA_INLINE short builtin_atomicExchange(short *acc, short value) +RAJA_INLINE short builtin_atomicExchange(short* acc, short value) { return _InterlockedExchange16(acc, value); } -RAJA_INLINE long builtin_atomicExchange(long *acc, long value) +RAJA_INLINE long builtin_atomicExchange(long* acc, long value) { return _InterlockedExchange(acc, value); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value) +RAJA_INLINE long long builtin_atomicExchange(long long* acc, long long value) { return _InterlockedExchange64(acc, value); } @@ -173,7 +176,7 @@ RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value) */ template ::value, bool> = true> -RAJA_INLINE void builtin_atomicStore(T *acc, T value) +RAJA_INLINE void builtin_atomicStore(T* acc, T value) { builtin_atomicExchange(acc, value); } @@ -182,24 +185,25 @@ RAJA_INLINE void builtin_atomicStore(T *acc, T value) /*! * Atomic compare and swap using intrinsics */ -RAJA_INLINE char builtin_atomicCAS(char *acc, char compare, char value) +RAJA_INLINE char builtin_atomicCAS(char* acc, char compare, char value) { return _InterlockedCompareExchange8(acc, value, compare); } -RAJA_INLINE short builtin_atomicCAS(short *acc, short compare, short value) +RAJA_INLINE short builtin_atomicCAS(short* acc, short compare, short value) { return _InterlockedCompareExchange16(acc, value, compare); } -RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value) +RAJA_INLINE long builtin_atomicCAS(long* acc, long compare, long value) { return _InterlockedCompareExchange(acc, value, compare); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value) +RAJA_INLINE long long +builtin_atomicCAS(long long* acc, long long compare, long long value) { return _InterlockedCompareExchange64(acc, value, compare); } @@ -210,24 +214,24 @@ RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long /*! * Atomic addition using intrinsics */ -RAJA_INLINE char builtin_atomicAdd(char *acc, char value) +RAJA_INLINE char builtin_atomicAdd(char* acc, char value) { return _InterlockedExchangeAdd8(acc, value); } -RAJA_INLINE short builtin_atomicAdd(short *acc, short value) +RAJA_INLINE short builtin_atomicAdd(short* acc, short value) { return _InterlockedExchangeAdd16(acc, value); } -RAJA_INLINE long builtin_atomicAdd(long *acc, long value) +RAJA_INLINE long builtin_atomicAdd(long* acc, long value) { return _InterlockedExchangeAdd(acc, value); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value) +RAJA_INLINE long long builtin_atomicAdd(long long* acc, long long value) { return _InterlockedExchangeAdd64(acc, value); } @@ -238,24 +242,24 @@ RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value) /*! 
* Atomic subtraction using intrinsics */ -RAJA_INLINE char builtin_atomicSub(char *acc, char value) +RAJA_INLINE char builtin_atomicSub(char* acc, char value) { return _InterlockedExchangeAdd8(acc, -value); } -RAJA_INLINE short builtin_atomicSub(short *acc, short value) +RAJA_INLINE short builtin_atomicSub(short* acc, short value) { return _InterlockedExchangeAdd16(acc, -value); } -RAJA_INLINE long builtin_atomicSub(long *acc, long value) +RAJA_INLINE long builtin_atomicSub(long* acc, long value) { return _InterlockedExchangeAdd(acc, -value); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value) +RAJA_INLINE long long builtin_atomicSub(long long* acc, long long value) { return _InterlockedExchangeAdd64(acc, -value); } @@ -266,24 +270,24 @@ RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value) /*! * Atomic and using intrinsics */ -RAJA_INLINE char builtin_atomicAnd(char *acc, char value) +RAJA_INLINE char builtin_atomicAnd(char* acc, char value) { return _InterlockedAnd8(acc, value); } -RAJA_INLINE short builtin_atomicAnd(short *acc, short value) +RAJA_INLINE short builtin_atomicAnd(short* acc, short value) { return _InterlockedAnd16(acc, value); } -RAJA_INLINE long builtin_atomicAnd(long *acc, long value) +RAJA_INLINE long builtin_atomicAnd(long* acc, long value) { return _InterlockedAnd(acc, value); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value) +RAJA_INLINE long long builtin_atomicAnd(long long* acc, long long value) { return _InterlockedAnd64(acc, value); } @@ -294,24 +298,24 @@ RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value) /*! * Atomic xor using intrinsics */ -RAJA_INLINE char builtin_atomicXor(char *acc, char value) +RAJA_INLINE char builtin_atomicXor(char* acc, char value) { return _InterlockedXor8(acc, value); } -RAJA_INLINE short builtin_atomicXor(short *acc, short value) +RAJA_INLINE short builtin_atomicXor(short* acc, short value) { return _InterlockedXor16(acc, value); } -RAJA_INLINE long builtin_atomicXor(long *acc, long value) +RAJA_INLINE long builtin_atomicXor(long* acc, long value) { return _InterlockedXor(acc, value); } #if defined(_WIN64) -RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value) +RAJA_INLINE long long builtin_atomicXor(long long* acc, long long value) { return _InterlockedXor64(acc, value); } @@ -327,10 +331,11 @@ RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value) * using an intrinsic */ template -struct builtin_useIntrinsic { +struct builtin_useIntrinsic +{ static constexpr bool value = - (std::is_integral::value || std::is_enum::value) && - (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); + (std::is_integral::value || std::is_enum::value) && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; @@ -339,54 +344,54 @@ struct builtin_useIntrinsic { * by reinterpreting inputs to types that intrinsics support */ template -struct builtin_useReinterpret { - static constexpr bool value = - !std::is_integral::value && - !std::is_enum::value && - ((sizeof(T) == 1 +struct builtin_useReinterpret +{ + static constexpr bool value = !std::is_integral::value && + !std::is_enum::value && + ((sizeof(T) == 1 #if !defined(UINT8_MAX) - && sizeof(unsigned char) == 1 + && sizeof(unsigned char) == 1 #endif - ) || - (sizeof(T) == 2 + ) || + (sizeof(T) == 2 #if !defined(UINT16_MAX) - && sizeof(unsigned short) == 2 + && sizeof(unsigned short) == 2 #endif - ) || - 
(sizeof(T) == 4 + ) || + (sizeof(T) == 4 #if !defined(UINT32_MAX) - && sizeof(unsigned int) == 4 + && sizeof(unsigned int) == 4 #endif - ) || - (sizeof(T) == 8 + ) || + (sizeof(T) == 8 #if !defined(UINT64_MAX) - && sizeof(unsigned long long) == 8 + && sizeof(unsigned long long) == 8 #endif - )); + )); using type = - std::conditional_t>>; + uint64_t>>>; #else - unsigned long long>>>; + unsigned long long>>>; #endif }; @@ -396,10 +401,11 @@ struct builtin_useReinterpret { * using a compare and swap loop */ template -struct builtin_useCAS { +struct builtin_useCAS +{ static constexpr bool value = - !std::is_integral::value && !std::is_enum::value && - (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); + !std::is_integral::value && !std::is_enum::value && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; @@ -413,7 +419,7 @@ struct builtin_useCAS { */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc) { return __atomic_load_n(acc, __ATOMIC_RELAXED); } @@ -424,7 +430,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value) { __atomic_store_n(acc, value, __ATOMIC_RELAXED); } @@ -435,7 +441,7 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value) { return __atomic_exchange_n(acc, value, __ATOMIC_RELAXED); } @@ -446,10 +452,10 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value) { - __atomic_compare_exchange_n( - acc, &compare, value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + __atomic_compare_exchange_n(acc, &compare, value, false, __ATOMIC_RELAXED, + __ATOMIC_RELAXED); return compare; } @@ -459,7 +465,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value) { return __atomic_fetch_add(acc, value, __ATOMIC_RELAXED); } @@ -470,7 +476,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value) { return __atomic_fetch_sub(acc, value, __ATOMIC_RELAXED); } @@ -481,7 +487,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value) { return __atomic_fetch_and(acc, value, __ATOMIC_RELAXED); } @@ -492,7 +498,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) */ template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value) { return __atomic_fetch_or(acc, value, __ATOMIC_RELAXED); } @@ -503,7 +509,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T 
builtin_atomicOr(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value)
 {
   return __atomic_fetch_xor(acc, value, __ATOMIC_RELAXED);
 }

@@ -529,12 +535,12 @@ using builtin_useReinterpret_t = typename builtin_useReinterpret<T>::type;
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   using R = builtin_useReinterpret_t<T>;

   return RAJA::util::reinterp_A_as_B<R, T>(
-      builtin_atomicLoad(reinterpret_cast<R *>(acc)));
+      builtin_atomicLoad(reinterpret_cast<R*>(acc)));
 }

@@ -543,7 +549,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;

@@ -557,13 +563,12 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;

-  return RAJA::util::reinterp_A_as_B<R, T>(
-      builtin_atomicExchange(reinterpret_cast<R *>(acc),
-                             RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }

@@ -572,14 +577,13 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value)
 {
   using R = builtin_useReinterpret_t<T>;

-  return RAJA::util::reinterp_A_as_B<R, T>(
-      builtin_atomicCAS(reinterpret_cast<R *>(acc),
-                        RAJA::util::reinterp_A_as_B<T, R>(compare),
-                        RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }

@@ -594,7 +598,7 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
  */
 template <typename T,
           std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   return a == b;
 }

@@ -607,7 +611,7 @@ RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
  */
 template <typename T,
           std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   using R = builtin_useReinterpret_t<T>;

@@ -622,15 +626,15 @@
  * Returns the OLD value that was replaced by the result of this operation.
  */
 template <typename T, typename Oper>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper)
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = builtin_atomicLoad(acc);
   T expected;

-  do {
+  do
+  {
     expected = old;
-    old = builtin_atomicCAS(acc, expected, oper(expected));
+    old      = builtin_atomicCAS(acc, expected, oper(expected));
   } while (!builtin_atomicCAS_equal(old, expected));

   return old;

@@ -644,21 +648,23 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
 * that was replaced by the result of this operation.
*/ template -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, - Oper &&oper, - ShortCircuit &&sc) +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc, + Oper&& oper, + ShortCircuit&& sc) { T old = builtin_atomicLoad(acc); - if (sc(old)) { + if (sc(old)) + { return old; } T expected; - do { + do + { expected = old; - old = builtin_atomicCAS(acc, expected, oper(expected)); + old = builtin_atomicCAS(acc, expected, oper(expected)); } while (!builtin_atomicCAS_equal(old, expected) && !sc(old)); return old; @@ -673,65 +679,50 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, /*! * Atomic addition using compare and swap loop */ -template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value) { - return builtin_atomicCAS_loop(acc, [value] (T old) { - return old + value; - }); + return builtin_atomicCAS_loop(acc, [value](T old) { return old + value; }); } /*! * Atomic subtraction using compare and swap loop */ -template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value) { - return builtin_atomicCAS_loop(acc, [value] (T old) { - return old - value; - }); + return builtin_atomicCAS_loop(acc, [value](T old) { return old - value; }); } /*! * Atomic and using compare and swap loop */ -template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value) { - return builtin_atomicCAS_loop(acc, [value] (T old) { - return old & value; - }); + return builtin_atomicCAS_loop(acc, [value](T old) { return old & value; }); } /*! * Atomic or using compare and swap loop */ -template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value) { - return builtin_atomicCAS_loop(acc, [value] (T old) { - return old | value; - }); + return builtin_atomicCAS_loop(acc, [value](T old) { return old | value; }); } /*! 
* Atomic xor using compare and swap loop */ -template ::value, bool> = true> -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value) { - return builtin_atomicCAS_loop(acc, [value] (T old) { - return old ^ value; - }); + return builtin_atomicCAS_loop(acc, [value](T old) { return old ^ value; }); } @@ -739,109 +730,105 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) template -RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T* acc) { return detail::builtin_atomicLoad(acc); } template -RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T* acc, T value) { detail::builtin_atomicStore(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T* acc, T value) { return detail::builtin_atomicAdd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T* acc, T value) { return detail::builtin_atomicSub(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T* acc, T value) { return detail::builtin_atomicCAS_loop( - acc, - [value] (T old) { - return value < old ? value : old; - }, - [value] (T current) { - return current <= value; - }); + acc, [value](T old) { return value < old ? value : old; }, + [value](T current) { return current <= value; }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T* acc, T value) { return detail::builtin_atomicCAS_loop( - acc, - [value] (T old) { - return old < value ? value : old; - }, - [value] (T current) { - return value <= current; - }); + acc, [value](T old) { return old < value ? value : old; }, + [value](T current) { return value <= current; }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc) { return detail::builtin_atomicAdd(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc, T value) { - return detail::builtin_atomicCAS_loop(acc, [value] (T old) { - return value <= old ? static_cast(0) : old + static_cast(1); - }); + return detail::builtin_atomicCAS_loop( + acc, [value](T old) + { return value <= old ? static_cast(0) : old + static_cast(1); }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc) { return detail::builtin_atomicSub(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc, T value) { - return detail::builtin_atomicCAS_loop(acc, [value] (T old) { - return old == static_cast(0) || value < old ? value : old - static_cast(1); - }); + return detail::builtin_atomicCAS_loop(acc, + [value](T old) + { + return old == static_cast(0) || + value < old + ? 
value + : old - static_cast(1); + }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T* acc, T value) { return detail::builtin_atomicAnd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T* acc, T value) { return detail::builtin_atomicOr(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T* acc, T value) { return detail::builtin_atomicXor(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T* acc, T value) { return detail::builtin_atomicExchange(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value) +RAJA_DEVICE_HIP RAJA_INLINE T +atomicCAS(builtin_atomic, T* acc, T compare, T value) { return detail::builtin_atomicCAS(acc, compare, value); } diff --git a/include/RAJA/policy/cuda/kernel/Sync.hpp b/include/RAJA/policy/cuda/kernel/Sync.hpp index ae00d346ae..e750c6bfc0 100644 --- a/include/RAJA/policy/cuda/kernel/Sync.hpp +++ b/include/RAJA/policy/cuda/kernel/Sync.hpp @@ -85,7 +85,8 @@ struct CudaStatementExecutor #else void exec(Data&, bool) - {} + { + } #endif static inline LaunchDims diff --git a/include/RAJA/policy/openmp/params/forall.hpp b/include/RAJA/policy/openmp/params/forall.hpp index e22b3d7d59..1f8c2a5e95 100644 --- a/include/RAJA/policy/openmp/params/forall.hpp +++ b/include/RAJA/policy/openmp/params/forall.hpp @@ -98,8 +98,8 @@ forall_impl(const ExecPol& p, RAJA_OMP_DECLARE_REDUCTION_COMBINE; RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel for schedule(static, ChunkSize) reduction(combine \ - : f_params) +#pragma omp parallel for schedule(static, ChunkSize) \ + reduction(combine : f_params) for (decltype(distance_it) i = 0; i < distance_it; ++i) { RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]); @@ -202,8 +202,8 @@ RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic& p, RAJA_OMP_DECLARE_REDUCTION_COMBINE; RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel for schedule(dynamic, ChunkSize) reduction(combine \ - : f_params) +#pragma omp parallel for schedule(dynamic, ChunkSize) \ + reduction(combine : f_params) for (decltype(distance_it) i = 0; i < distance_it; ++i) { RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]); @@ -257,8 +257,8 @@ RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided& p, RAJA_OMP_DECLARE_REDUCTION_COMBINE; RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel for schedule(guided, ChunkSize) reduction(combine \ - : f_params) +#pragma omp parallel for schedule(guided, ChunkSize) \ + reduction(combine : f_params) for (decltype(distance_it) i = 0; i < distance_it; ++i) { RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]); diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp index ea88a7b2ff..91f8c1d2a2 100644 --- a/include/RAJA/policy/openmp/sort.hpp +++ b/include/RAJA/policy/openmp/sort.hpp @@ -177,7 +177,7 @@ inline void sort(Sorter sorter, Iter begin, Iter end, Compare comp) std::min((n + iterates_per_task - 1) / iterates_per_task, max_threads); RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code -#pragma omp parallel num_threads(static_cast (requested_num_threads)) +#pragma omp parallel 
num_threads(static_cast<int>(requested_num_threads))
 #pragma omp master
     {
       sort_task(sorter, begin, 0, n, iterates_per_task, comp);
@@ -189,7 +189,7 @@ inline void sort(Sorter sorter, Iter begin, Iter end, Compare comp)
         (n + min_iterates_per_task - 1) / min_iterates_per_task, max_threads);
     RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code

-#pragma omp parallel num_threads(static_cast<int> (requested_num_threads))
+#pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
     {
       sort_parallel_region(sorter, begin, n, comp);
     }
diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp
index 4c48a12eda..018b3878d8 100644
--- a/include/RAJA/policy/openmp_target.hpp
+++ b/include/RAJA/policy/openmp_target.hpp
@@ -30,7 +30,7 @@
 #include "RAJA/policy/openmp_target/kernel.hpp"
 #include "RAJA/policy/openmp_target/forall.hpp"
 #include "RAJA/policy/openmp_target/reduce.hpp"
-//#include "RAJA/policy/openmp_target/multi_reduce.hpp"
+// #include "RAJA/policy/openmp_target/multi_reduce.hpp"
 #include "RAJA/policy/openmp_target/WorkGroup.hpp"

diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index a142b6a606..c61a7d09f5 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -79,9 +79,7 @@ forall_impl(resources::Omp omp_res,
   auto i = distance_it;
 #pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to \
-                            : body, begin_it) reduction(combine \
-                                                        : f_params)
+    schedule(static, 1) map(to : body, begin_it) reduction(combine : f_params)
   for (i = 0; i < distance_it; ++i)
   {
     Body ib = body;
@@ -133,8 +131,7 @@ forall_impl(resources::Omp omp_res,
   auto i = distance_it;
 #pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to \
-                            : body, begin_it)
+    schedule(static, 1) map(to : body, begin_it)
   for (i = 0; i < distance_it; ++i)
   {
     Body ib = body;
@@ -167,8 +164,7 @@ forall_impl(resources::Omp omp_res,
   RAJA_EXTRACT_BED_IT(iter);
 #pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body, begin_it) reduction(combine \
-                                           : f_params)
+    firstprivate(body, begin_it) reduction(combine : f_params)
   for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
     Body ib = body;
diff --git a/include/RAJA/policy/openmp_target/params/reduce.hpp b/include/RAJA/policy/openmp_target/params/reduce.hpp
index 34c23fb5db..0364470945 100644
--- a/include/RAJA/policy/openmp_target/params/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/params/reduce.hpp
@@ -3,37 +3,43 @@

 #include "RAJA/pattern/params/reducer.hpp"

-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{

 #if defined(RAJA_ENABLE_TARGET_OPENMP)

-  // Init
-  template <typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template <typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template <typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+init(Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template <typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}

 #endif

-} // namespace detail
-} // namespace expt
-} // namespace RAJA
+}  // namespace detail
+}  // namespace expt
+}  // namespace RAJA

-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/reduce.hpp b/include/RAJA/policy/openmp_target/reduce.hpp
index 8bcbde620d..0470c52136 100644
--- a/include/RAJA/policy/openmp_target/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/reduce.hpp
@@ -12,7 +12,7 @@

 #if defined(RAJA_ENABLE_TARGET_OPENMP)

-//#include <omp.h> // Leaving out until XL is fixed 2/25/2019.
+// #include <omp.h> // Leaving out until XL is fixed 2/25/2019.

 #include

diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp
index 491e39910c..81f16d4918 100644
--- a/include/RAJA/policy/sycl.hpp
+++ b/include/RAJA/policy/sycl.hpp
@@ -29,13 +29,13 @@
 #include "RAJA/policy/sycl/forall.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/reduce.hpp"
-//#include "RAJA/policy/sycl/multi_reduce.hpp"
-//#include "RAJA/policy/sycl/scan.hpp"
-//#include "RAJA/policy/sycl/sort.hpp"
+// #include "RAJA/policy/sycl/multi_reduce.hpp"
+// #include "RAJA/policy/sycl/scan.hpp"
+// #include "RAJA/policy/sycl/sort.hpp"
 #include "RAJA/policy/sycl/kernel.hpp"
-//#include "RAJA/policy/sycl/synchronize.hpp"
+// #include "RAJA/policy/sycl/synchronize.hpp"
 #include "RAJA/policy/sycl/launch.hpp"
-//#include "RAJA/policy/sycl/WorkGroup.hpp"
+// #include "RAJA/policy/sycl/WorkGroup.hpp"

 #endif // closing endif for if defined(RAJA_ENABLE_SYCL)
diff --git a/include/RAJA/policy/sycl/kernel.hpp b/include/RAJA/policy/sycl/kernel.hpp
index 641c3a9ef3..803bcd49e0 100644
--- a/include/RAJA/policy/sycl/kernel.hpp
+++ b/include/RAJA/policy/sycl/kernel.hpp
@@ -23,11 +23,11 @@
 #include "RAJA/policy/sycl/kernel/SyclKernel.hpp"
 #include "RAJA/policy/sycl/kernel/For.hpp"
 #include "RAJA/policy/sycl/kernel/ForICount.hpp"
-//#include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
-//#include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
+// #include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
+// #include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/sycl/kernel/Lambda.hpp"
-//#include "RAJA/policy/sycl/kernel/Reduce.hpp"
-//#include "RAJA/policy/sycl/kernel/Sync.hpp"
+// #include "RAJA/policy/sycl/kernel/Reduce.hpp"
+// #include "RAJA/policy/sycl/kernel/Sync.hpp"
 #include "RAJA/policy/sycl/kernel/Tile.hpp"
 #include "RAJA/policy/sycl/kernel/TileTCount.hpp"
 #include "RAJA/policy/sycl/kernel/internal.hpp"
diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
index 4c79a279d9..d36a7fa2af 100644
--- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
+++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
@@ -143,12 +143,9 @@ struct SyclLaunchHelper, StmtList, Data, Types>
       qu->submit(
           [&](cl::sycl::handler& h)
           {
-            h.parallel_for(launch_dims.fit_nd_range(qu),
-                           [=](cl::sycl::nd_item<3> item)
-                           {
-                             SyclKernelLauncher<Data, executor_t>(*m_data,
-                                                                  item);
-                           });
+            h.parallel_for(
+                launch_dims.fit_nd_range(qu), [=](cl::sycl::nd_item<3> item)
+                { SyclKernelLauncher<Data, executor_t>(*m_data, item); });
           })
           .wait(); // Need to wait to free memory

@@ -183,9 +180,7 @@ struct SyclLaunchHelper, StmtList, Data, Types>
         {
           h.parallel_for(launch_dims.fit_nd_range(qu),
                          [=](cl::sycl::nd_item<3> item)
-                         {
-                           SyclKernelLauncher<Data, executor_t>(data, item);
-                         });
+                         {
SyclKernelLauncher(data, item); }); }); if (!async) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index fcffc88aed..5cef8f570d 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -22,7 +22,7 @@ #include "RAJA/pattern/detail/privatizer.hpp" #include "RAJA/policy/sycl/policy.hpp" #include "RAJA/policy/sycl/MemUtils_SYCL.hpp" -//#include "RAJA/policy/sycl/raja_syclerrchk.hpp" +// #include "RAJA/policy/sycl/raja_syclerrchk.hpp" #include "RAJA/util/resource.hpp" namespace RAJA diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index d1b15538ae..be3db700a6 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -141,15 +141,16 @@ removenth(Lay lyout, Tup&& tup) -> decltype(selecttuple( // the index into the array-of-pointers to be moved around in the MultiView // operator(); see the operator overload. Default of 0 means that the p2p index // is in the 0th position. -template // removes - // * - >>>>> +template < + typename ValueType, + typename LayoutType, + RAJA::Index_type P2Pidx = 0, + typename PointerType = ValueType**, + typename NonConstPointerType = camp::type::ptr::add< // adds * + camp::type::ptr::add // removes + // * + >>>>> struct MultiView { using value_type = ValueType; diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 4e185591f3..b38e82e45b 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -246,7 +246,7 @@ using Real_type = double; #elif defined(RAJA_USE_FLOAT) /// -using Real_type = float; +using Real_type = float; #else #error RAJA Real_type is undefined! @@ -845,8 +845,8 @@ using UnalignedReal_ptr = Real_type*; using const_UnalignedReal_ptr = const Real_type*; #elif defined(RAJA_USE_RESTRICT_PTR) -using Real_ptr = Real_type* RAJA_RESTRICT; -using const_Real_ptr = const Real_type* RAJA_RESTRICT; +using Real_ptr = Real_type* RAJA_RESTRICT; +using const_Real_ptr = const Real_type* RAJA_RESTRICT; #if defined(RAJA_USE_COMPLEX) using Complex_ptr = Complex_type* RAJA_RESTRICT; @@ -857,24 +857,24 @@ using UnalignedReal_ptr = Real_type* RAJA_RESTRICT; using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT; #elif defined(RAJA_USE_RESTRICT_ALIGNED_PTR) -using Real_ptr = TDRAReal_ptr; -using const_Real_ptr = const_TDRAReal_ptr; +using Real_ptr = TDRAReal_ptr; +using const_Real_ptr = const_TDRAReal_ptr; #if defined(RAJA_USE_COMPLEX) -using Complex_ptr = Complex_type* RAJA_RESTRICT; -using const_Complex_ptr = const Complex_type* RAJA_RESTRICT; +using Complex_ptr = Complex_type* RAJA_RESTRICT; +using const_Complex_ptr = const Complex_type* RAJA_RESTRICT; #endif using UnalignedReal_ptr = Real_type* RAJA_RESTRICT; using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT; #elif defined(RAJA_USE_PTR_CLASS) -using Real_ptr = RestrictAlignedRealPtr; -using const_Real_ptr = ConstRestrictAlignedRealPtr; +using Real_ptr = RestrictAlignedRealPtr; +using const_Real_ptr = ConstRestrictAlignedRealPtr; #if defined(RAJA_USE_COMPLEX) -using Complex_ptr = RestrictComplexPtr; -using const_Complex_ptr = ConstRestrictComplexPtr; +using Complex_ptr = RestrictComplexPtr; +using const_Complex_ptr = ConstRestrictComplexPtr; #endif using UnalignedReal_ptr = RestrictRealPtr; diff --git a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp index 0adce05b3d..97dccdfead 100644 --- 
a/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp +++ b/test/functional/forall/atomic-ref/tests/test-forall-AtomicRefCAS.hpp @@ -64,7 +64,8 @@ struct CompareExchangeWeakOtherOp : all_op { T expect = (T)0; while (!other.compare_exchange_weak(expect, (T)i)) - {} + { + } return expect; } RAJA::AtomicRef other; @@ -92,7 +93,8 @@ struct CompareExchangeStrongOtherOp : all_op { T expect = (T)0; while (!other.compare_exchange_strong(expect, (T)i)) - {} + { + } return expect; } RAJA::AtomicRef other; diff --git a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp index b54eec5b05..bcf51593d6 100644 --- a/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp +++ b/test/functional/kernel/nested-loop/tests/nested-loop-MultiLambdaParam-impl.hpp @@ -176,11 +176,12 @@ struct MultiLambdaParamNestedLoopExec typename camp::at>::type, RAJA::statement::Lambda<1> // inner loop: dot += ... >, - RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set - // C(row, - // col) - // = - // dot + RAJA::statement:: + Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set + // C(row, + // col) + // = + // dot >>>; }; diff --git a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp index 699a6ff776..f4dd3648a0 100644 --- a/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp +++ b/test/functional/kernel/reduce-loc/tests/test-kernel-reduceloc-Max2DViewTuple.hpp @@ -56,7 +56,7 @@ void KernelLocMax2DViewTupleTestImpl(const int xdim, const int ydim) work_res.memcpy(work_array, check_array, sizeof(DATA_TYPE) * array_length); #if defined(RAJA_ENABLE_TARGET_OPENMP) - //#pragma omp target data map(to:work_array[0:array_length]) + // #pragma omp target data map(to:work_array[0:array_length]) #endif RAJA::TypedRangeSegment colrange(0, xdim); diff --git a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp index ae6d7e384a..bb64d5424b 100644 --- a/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp +++ b/test/functional/launch/nested_direct/tests/test-launch-nested-Direct.hpp @@ -161,9 +161,7 @@ void LaunchNestedDirectTestImpl(INDEX_TYPE M) RAJA::loop( ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) - { - working_array[0]++; - }); + { working_array[0]++; }); }); }); }); diff --git a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp index cd90bf2298..c9192b6718 100644 --- a/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp +++ b/test/functional/launch/nested_loop/tests/test-launch-nested-Loop.hpp @@ -163,9 +163,7 @@ void LaunchNestedLoopTestImpl(INDEX_TYPE M) RAJA::loop( ctx, r1, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) - { - working_array[0]++; - }); + { working_array[0]++; }); }); }); }); diff --git a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp index 4a212875f0..20a4e10ac6 100644 --- a/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp +++ b/test/functional/launch/nested_tile_direct/tests/test-launch-nested-Tile-Direct.hpp @@ -145,9 +145,7 @@ void 
LaunchNestedTileDirectTestImpl(INDEX_TYPE M) RAJA::loop( ctx, x_tile, [&](INDEX_TYPE RAJA_UNUSED_ARG(tx)) - { - working_array[0]++; - }); + { working_array[0]++; }); }); }); }); diff --git a/test/functional/tensor/matrix/test-tensor-matrix-double.hpp b/test/functional/tensor/matrix/test-tensor-matrix-double.hpp index d988dd8e55..4457687cae 100644 --- a/test/functional/tensor/matrix/test-tensor-matrix-double.hpp +++ b/test/functional/tensor/matrix/test-tensor-matrix-double.hpp @@ -45,23 +45,23 @@ using TensorMatrixTypes = ::testing::Types< #endif -//#ifdef __AVX__ -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, -// RAJA::expt::RectMatrixRegister, +// #ifdef __AVX__ +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, +// RAJA::expt::RectMatrixRegister, // -//#endif +// #endif #ifdef __AVX2__ diff --git a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp index da498db615..3b1111b6ef 100644 --- a/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp +++ b/test/functional/tensor/vector/tests/test-tensor-vector-ForallVectorRef2d.hpp @@ -110,12 +110,9 @@ ForallVectorRef2dImpl() C[i] = 0.0; } - RAJA::forall(RAJA::TypedRangeSegment(0, M), - [=](index_t j) - { - Z(all, j) = - 3 + (X(all, j) * (5 / Y(all, j))) + 9; - }); + RAJA::forall( + RAJA::TypedRangeSegment(0, M), + [=](index_t j) { Z(all, j) = 3 + (X(all, j) * (5 / Y(all, j))) + 9; }); for (index_t i = 0; i < N * M; i++) { @@ -131,12 +128,9 @@ ForallVectorRef2dImpl() C[i] = 0.0; } - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=](index_t i) - { - Z(i, all) = - 3 + (X(i, all) * (5 / Y(i, all))) + 9; - }); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=](index_t i) { Z(i, all) = 3 + (X(i, all) * (5 / Y(i, all))) + 9; }); for (index_t i = 0; i < N * M; i++) { diff --git a/test/include/RAJA_gtest.hpp b/test/include/RAJA_gtest.hpp index 4b4c786784..7a96d914ae 100644 --- a/test/include/RAJA_gtest.hpp +++ b/test/include/RAJA_gtest.hpp @@ -81,20 +81,22 @@ #define GPU_TYPED_TEST_P(SuiteName, TestName) \ namespace GTEST_SUITE_NAMESPACE_(SuiteName) \ { \ - template \ - class TestName : public SuiteName \ - { \ - private: \ - typedef SuiteName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ + template \ + class TestName : public SuiteName \ + { \ + private: \ + typedef SuiteName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ \ - public: \ - void TestBody() override; \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ - __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName), \ - GTEST_STRINGIFY_(TestName)); \ + public: \ + void TestBody() override; \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ + __FILE__, \ + __LINE__, \ + GTEST_STRINGIFY_(SuiteName), \ + GTEST_STRINGIFY_(TestName)); \ } \ template \ void GTEST_SUITE_NAMESPACE_( \ diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index b755677c2e..66fc6f9c7a 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ 
b/test/include/RAJA_test-reducepol.hpp @@ -29,7 +29,7 @@ using OpenMPReducePols = #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - using OpenMPTargetReducePols = camp::list; +using OpenMPTargetReducePols = camp::list; #endif #if defined(RAJA_ENABLE_CUDA) diff --git a/test/old-tests/unit/test-sharedmem.cpp b/test/old-tests/unit/test-sharedmem.cpp index 702d14c8d8..504a850576 100644 --- a/test/old-tests/unit/test-sharedmem.cpp +++ b/test/old-tests/unit/test-sharedmem.cpp @@ -692,8 +692,8 @@ using CUDATypes = ::testing::Types< RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1>>>, - RAJA::statement::CudaSyncThreads> // close shared memory - // scope + RAJA::statement::CudaSyncThreads> // close shared memory + // scope > // for 2 > // for 3 > // CudaKernel @@ -728,8 +728,8 @@ using CUDATypes = ::testing::Types< RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1>>>, - RAJA::statement::CudaSyncThreads> // close shared memory - // scope + RAJA::statement::CudaSyncThreads> // close shared memory + // scope > // for 2 > // for 3 > // CudaKernel @@ -772,8 +772,8 @@ using HIPTypes = ::testing::Types< RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<1>>>, - RAJA::statement::HipSyncThreads> // close shared memory - // scope + RAJA::statement::HipSyncThreads> // close shared memory + // scope > // for 2 > // for 3 > // HipKernel @@ -808,8 +808,8 @@ using HIPTypes = ::testing::Types< RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<1>>>, - RAJA::statement::HipSyncThreads> // close shared memory - // scope + RAJA::statement::HipSyncThreads> // close shared memory + // scope > // for 2 > // for 3 > // HipKernel @@ -918,34 +918,24 @@ GPU_TYPED_TEST_P(MatMultiply, shmem) // Zero out thread local memory for storing dot products [=] RAJA_HOST_DEVICE(int tn, int tp, ThreadPriv& pVal) - { - pVal(tn, tp) = 0.0; - }, + { pVal(tn, tp) = 0.0; }, // Load tile of A [=] RAJA_HOST_DEVICE(int n, int m, int tn, int tm, Shmem& aShared) - { - aShared(tn, tm) = Aview(n, m); - }, + { aShared(tn, tm) = Aview(n, m); }, // Load tile of B [=] RAJA_HOST_DEVICE(int m, int p, int tm, int tp, Shmem& bShared) - { - bShared(tm, tp) = Bview(m, p); - }, + { bShared(tm, tp) = Bview(m, p); }, // Do partial update in shmem [=] RAJA_HOST_DEVICE(int tn, int tm, int tp, Shmem& aShared, Shmem& bShared, ThreadPriv& pVal) - { - pVal(tn, tp) += aShared(tn, tm) * bShared(tm, tp); - }, + { pVal(tn, tp) += aShared(tn, tm) * bShared(tm, tp); }, // Write out complete result [=] RAJA_HOST_DEVICE(int n, int p, int tn, int tp, ThreadPriv& pVal) - { - Cview(n, p) = pVal(tn, tp); - }); + { Cview(n, p) = pVal(tn, tp); }); // copy result back to host (NOP on CPU) TypeParam::copy_d2h(N * P, C, d_C); @@ -1069,10 +1059,11 @@ struct Policy_MatMultiply_cpu RAJA::statement::For< 1, RAJA::seq_exec, - RAJA::statement::For<0, - RAJA::seq_exec, - shmem_Lambda3>>>>, // sliding - // window + RAJA::statement::For< + 0, + RAJA::seq_exec, + shmem_Lambda3>>>>, // sliding + // window // Write memory out to global matrix RAJA::statement::For<
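
Editor's note: the common thread in the atomic_builtin.hpp hunks above is the compare-and-swap retry loop that backs every atomic operation lacking a native intrinsic. The following standalone sketch mirrors that pattern using only the GCC/Clang __atomic builtins. The builtin_* names echo the diff, but the RAJA macros, the MSVC/_Interlocked path, and the builtin_atomicCAS_equal reinterpret machinery are deliberately omitted; the atomicMax-style lambda in main is an illustrative assumption, not library code.

// cas_loop_sketch.cpp -- illustrative only; compile with: g++ -std=c++14 cas_loop_sketch.cpp
#include <cstdio>

// Relaxed atomic load, mirroring builtin_atomicLoad in the diff.
template <typename T>
T builtin_atomicLoad(T* acc)
{
  return __atomic_load_n(acc, __ATOMIC_RELAXED);
}

// Compare-and-swap returning the value actually observed in *acc,
// mirroring builtin_atomicCAS in the diff.
template <typename T>
T builtin_atomicCAS(T* acc, T compare, T value)
{
  __atomic_compare_exchange_n(acc, &compare, value, false, __ATOMIC_RELAXED,
                              __ATOMIC_RELAXED);
  return compare;  // updated to the observed value if the CAS failed
}

// The retry loop the diff reformats: keep applying `oper` to the freshly
// observed value until the CAS lands, then hand back the old value.
template <typename T, typename Oper>
T builtin_atomicCAS_loop(T* acc, Oper&& oper)
{
  T old = builtin_atomicLoad(acc);
  T expected;
  do
  {
    expected = old;
    old      = builtin_atomicCAS(acc, expected, oper(expected));
  } while (old != expected);  // the real code routes this through builtin_atomicCAS_equal
  return old;
}

int main()
{
  int counter = 5;
  // atomicMax-style update expressed through the loop (hypothetical usage)
  int prev = builtin_atomicCAS_loop(&counter,
                                    [](int old) { return old < 9 ? 9 : old; });
  std::printf("previous = %d, current = %d\n", prev, counter);  // prints 5, 9
  return 0;
}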