From 65fd2d47db4c07c6619d3a734b2c058ad24dfdf8 Mon Sep 17 00:00:00 2001 From: john bowen Date: Tue, 5 Nov 2024 11:31:19 -0800 Subject: [PATCH] Clang format --- .clang-format | 7 +- benchmark/benchmark-atomic.cpp | 626 ++-- benchmark/host-device-lambda-benchmark.cpp | 23 +- benchmark/ltimes.cpp | 2987 ++++++++++---------- benchmark/raja_view_blur.cpp | 231 +- cmake/RAJAMacros.cmake | 16 +- 6 files changed, 1988 insertions(+), 1902 deletions(-) diff --git a/.clang-format b/.clang-format index b6fa54b233..898f791bf7 100644 --- a/.clang-format +++ b/.clang-format @@ -31,7 +31,7 @@ BraceWrapping: AfterExternBlock: false BeforeCatch: true BeforeElse: true - BeforeLambdaBody: true + BeforeLambdaBody: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false @@ -40,12 +40,13 @@ BraceWrapping: # Pointer alignment DerivePointerAlignment: false PointerAlignment: Left + +# Single line config AllowShortIfStatementsOnASingleLine : true AllowShortFunctionsOnASingleLine : true AllowShortLoopsOnASingleLine : false AllowAllArgumentsOnNextLine : true AllowAllParametersOfDeclarationOnNextLine : false -AlignTrailingComments : true BinPackArguments : true BinPackParameters : false ConstructorInitializerAllOnOneLineOrOnePerLine : true @@ -67,4 +68,4 @@ SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: false SpacesInConditionalStatement: false SpacesInParentheses: false -SpacesInSquareBrackets: false +SpacesInSquareBrackets: false \ No newline at end of file diff --git a/benchmark/benchmark-atomic.cpp b/benchmark/benchmark-atomic.cpp index ebe3a858ff..ecb316b451 100644 --- a/benchmark/benchmark-atomic.cpp +++ b/benchmark/benchmark-atomic.cpp @@ -5,15 +5,16 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // This file is intended to provide an interface for comparing the performance -// of RAJA's atomic implementations with Desul's atomic implementations. In order -// to accomplish this without requiring two separate build system configurations -// this file directly includes "desul/atomics.hpp" and directly calls desul namespace -// atomics. This is different from how a typical RAJA user would call a desul atomic. +// of RAJA's atomic implementations with Desul's atomic implementations. In +// order to accomplish this without requiring two separate build system +// configurations this file directly includes "desul/atomics.hpp" and directly +// calls desul namespace atomics. This is different from how a typical RAJA +// user would call a desul atomic. #include "RAJA/RAJA.hpp" #include "RAJA/util/for_each.hpp" #include "RAJA/util/Timer.hpp" -#if defined (RAJA_ENABLE_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif #include "desul/atomics.hpp" @@ -25,85 +26,99 @@ /// Conditional compilation for CUDA benchmarks. 
-#if defined (RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::cuda_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "CUDA execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::cuda_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "CUDA execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::cuda::cuda_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::cuda::cuda_atomic; }; -#elif defined (RAJA_ENABLE_HIP) +#elif defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/atomic.hpp" -template -struct ExecPolicyGPU { - using policy = RAJA::hip_exec; - static std::string PolicyName() { - std::stringstream ss; - ss << "HIP execution with block size " << BLOCK_SZ; - return ss.str(); - } +template +struct ExecPolicyGPU +{ + using policy = RAJA::hip_exec; + static std::string PolicyName() + { + std::stringstream ss; + ss << "HIP execution with block size " << BLOCK_SZ; + return ss.str(); + } }; -struct GPUAtomic { - using policy = RAJA::policy::hip::hip_atomic; +struct GPUAtomic +{ + using policy = RAJA::policy::hip::hip_atomic; }; #endif #define BLOCK_SZ 256 -#define INDENT " " +#define INDENT " " using raja_default_desul_order = desul::MemoryOrderRelaxed; using raja_default_desul_scope = desul::MemoryScopeDevice; -// Desul atomics have a different signature than RAJA's built in ops. The following code provides some -// helper function templates so that they can be called using the same signature in timing code. +// Desul atomics have a different signature than RAJA's built in ops. The +// following code provides some helper function templates so that they can be +// called using the same signature in timing code. // Struct holding Desul atomic signature typedef -template -struct DesulAtomicSignature { - using type = ReturnType(*)(Args..., raja_default_desul_order, raja_default_desul_scope); +template +struct DesulAtomicSignature +{ + using type = ReturnType (*)(Args..., + raja_default_desul_order, + raja_default_desul_scope); }; // Struct holding RAJA atomic signature typedef -template -struct RajaAtomicSignature { - using type = AtomicType(*)(AtomicType*, const AtomicType); +template +struct RajaAtomicSignature +{ + using type = AtomicType (*)(AtomicType*, const AtomicType); }; -/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and have an ambiguous type -/// so they can't be passed as a template parameter. -/// The following macro disambiguates the call and provide a signature comaptible with the DESUL -/// wrapper. AtomicOperation must be a valid RAJA namespace atomic operation, like atomicAdd, -/// atomicMax, etc. 
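/// For instance, OPERATOR_CALL_BINARY(atomicAdd) below expands to roughly
///
///   RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const
///   {
///     return RAJA::atomicAdd(Policy {}, acc, val);
///   }
///
/// so every wrapper functor exposes the same (T*, T) call shape that the desul
/// wrappers defined further down provide.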
-#define OPERATOR_CALL_BINARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const { \ - return RAJA::AtomicOperation(Policy {}, acc, val); \ - } \ - -#define OPERATOR_CALL_UNARY(AtomicOperation) \ - template \ - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const { \ - return RAJA::AtomicOperation(Policy {}, acc); \ - } \ - -#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ -template \ -struct AtomicFunctorName { \ - const char* name = #AtomicFunctorName ; \ - AtomicOperatorDeclaration \ -}; \ +/// RAJA::atomicAdd and other RAJA namespace atomic calls are overloaded and +/// have an ambiguous type so they can't be passed as a template parameter. The +/// following macro disambiguates the call and provide a signature comaptible +/// with the DESUL wrapper. AtomicOperation must be a valid RAJA namespace +/// atomic operation, like atomicAdd, atomicMax, etc. +#define OPERATOR_CALL_BINARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType val) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc, val); \ + } + +#define OPERATOR_CALL_UNARY(AtomicOperation) \ + template \ + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, const ArgType) const \ + { \ + return RAJA::AtomicOperation(Policy {}, acc); \ + } + +#define DECLARE_ATOMIC_WRAPPER(AtomicFunctorName, AtomicOperatorDeclaration) \ + template \ + struct AtomicFunctorName \ + { \ + const char* name = #AtomicFunctorName; \ + AtomicOperatorDeclaration \ + }; DECLARE_ATOMIC_WRAPPER(AtomicAdd, OPERATOR_CALL_BINARY(atomicAdd)) DECLARE_ATOMIC_WRAPPER(AtomicSub, OPERATOR_CALL_BINARY(atomicSub)) @@ -119,228 +134,365 @@ DECLARE_ATOMIC_WRAPPER(AtomicXor, OPERATOR_CALL_BINARY(atomicXor)) DECLARE_ATOMIC_WRAPPER(AtomicExchange, OPERATOR_CALL_BINARY(atomicExchange)) DECLARE_ATOMIC_WRAPPER(AtomicLoad, OPERATOR_CALL_UNARY(atomicLoad)) -/// Instead of complicating the above macro to handle these two atomics, do the declarations -/// manually below. -template -struct AtomicStore { - const char* name = "AtomicStore"; - template - RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const { - return RAJA::atomicStore(Policy {}, acc, val); - } +/// Instead of complicating the above macro to handle these two atomics, do the +/// declarations manually below. +template +struct AtomicStore +{ + const char* name = "AtomicStore"; + template + RAJA_HOST_DEVICE void operator()(ArgType* acc, const ArgType val) const + { + return RAJA::atomicStore(Policy {}, acc, val); + } }; -template -struct AtomicCAS { - const char* name = "AtomicCAS"; - template - RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const { - return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); - } +template +struct AtomicCAS +{ + const char* name = "AtomicCAS"; + template + RAJA_HOST_DEVICE ArgType operator()(ArgType* acc, ArgType compare) const + { + return RAJA::atomicCAS(Policy {}, acc, compare, ArgType(1)); + } }; /// ExecPolicy wrapper for OpenMP -struct ExecPolicyOMP { - using policy = RAJA::omp_parallel_for_exec;; - static std::string PolicyName() { - std::stringstream ss; - ss << "OpenMP execution"; - return ss.str(); - } +struct ExecPolicyOMP +{ + using policy = RAJA::omp_parallel_for_exec; + ; + static std::string PolicyName() + { + std::stringstream ss; + ss << "OpenMP execution"; + return ss.str(); + } }; -/// Functor wrapping the desul implementation. 
Wrapping the desul call ensure an identical signature with -/// RAJA's implementations. Wrapping the call in an functor allows simple type deduction for printing -/// from within the benchmark. -template::type atomic_impl> -struct atomicWrapperDesulTernary { - /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature - RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const { - return atomic_impl(acc, value, T(1), raja_default_desul_order{}, - raja_default_desul_scope{}); - } +/// Functor wrapping the desul implementation. Wrapping the desul call ensure +/// an identical signature with RAJA's implementations. Wrapping the call in an +/// functor allows simple type deduction for printing from within the benchmark. +template < + typename T, + typename ReturnType, + typename DesulAtomicSignature::type atomic_impl> +struct atomicWrapperDesulTernary +{ + /// Call operator overload template that allows invoking DESUL atomic with a + /// (int*)(T*, T) signature + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const + { + return atomic_impl(acc, value, T(1), raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; -template::type atomic_impl> -struct atomicWrapperDesulBinary { - /// Call operator overload template that allows invoking DESUL atomic with a (int*)(T*, T) signature - RAJA_HOST_DEVICE ReturnType operator()(T * acc, T value) const { - return atomic_impl(acc, value, raja_default_desul_order{}, - raja_default_desul_scope{}); - } +template < + typename T, + typename ReturnType, + typename DesulAtomicSignature::type atomic_impl> +struct atomicWrapperDesulBinary +{ + /// Call operator overload template that allows invoking DESUL atomic with a + /// (int*)(T*, T) signature + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T value) const + { + return atomic_impl(acc, value, raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; /// Unary wrapper variant for increment and decrement benchmarks. 
-template::type atomic_impl> -struct atomicWrapperDesulUnary { - RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const { - return atomic_impl(acc, raja_default_desul_order{}, - raja_default_desul_scope{}); - } +template ::type atomic_impl> +struct atomicWrapperDesulUnary +{ + RAJA_HOST_DEVICE ReturnType operator()(T* acc, T) const + { + return atomic_impl(acc, raja_default_desul_order {}, + raja_default_desul_scope {}); + } }; -template -class IsDesul : public std::false_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template::type atomic_impl> -class IsDesul> : public std::true_type {}; - -template -std::string GetImplName (const AtomicImplType& impl) { - if (IsDesul::value) { - return "Desul atomic"; - } else { - return "RAJA atomic"; - } +template +class IsDesul : public std::false_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template ::type atomic_impl> +class IsDesul> + : public std::true_type +{}; + +template +std::string GetImplName(const AtomicImplType& impl) +{ + if (IsDesul::value) + { + return "Desul atomic"; + } + else + { + return "RAJA atomic"; + } } -template -void TimeAtomicOp(const AtomicImplType& atomic_impl, uint64_t N, uint64_t num_iterations = 4, int array_size = 100, bool print_to_output = true) { - RAJA::Timer timer; - - // Allocate memory - AtomicType* device_value = nullptr; - int len_array = test_array ? array_size : 1; - camp::resources::Resource resource {RAJA::resources::get_resource::type::get_default()}; - device_value = resource.allocate(len_array); - - timer.start(); - if (test_array) { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); - }); - } - } else { - for (uint64_t i = 0; i < num_iterations; ++i) { - RAJA::forall(RAJA::TypedRangeSegment(0, N), - [=] RAJA_HOST_DEVICE(uint64_t tid) { - atomic_impl(device_value, AtomicType(1)); - }); - } +template +void TimeAtomicOp(const AtomicImplType& atomic_impl, + uint64_t N, + uint64_t num_iterations = 4, + int array_size = 100, + bool print_to_output = true) +{ + RAJA::Timer timer; + + // Allocate memory + AtomicType* device_value = nullptr; + int len_array = test_array ? 
array_size : 1; + camp::resources::Resource resource {RAJA::resources::get_resource< + typename ExecPolicy::policy>::type::get_default()}; + device_value = resource.allocate(len_array); + + timer.start(); + if (test_array) + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(&(device_value[tid % array_size]), AtomicType(1)); }); } - - resource.wait(); - timer.stop(); - resource.deallocate(device_value); - - double t = timer.elapsed(); - if (print_to_output) { - std::cout << INDENT << INDENT << t << "s" << INDENT; - std::cout << GetImplName(atomic_impl) << ", "; - if (test_array) { - std::cout << "Number of atomics under contention " << array_size << ", "; - } - std::cout << num_iterations * N << " many atomic operations" << ", "; - std::cout << ExecPolicy::PolicyName(); - std::cout << std::endl; + } + else + { + for (uint64_t i = 0; i < num_iterations; ++i) + { + RAJA::forall( + RAJA::TypedRangeSegment(0, N), + [=] RAJA_HOST_DEVICE(uint64_t tid) + { atomic_impl(device_value, AtomicType(1)); }); + } + } + + resource.wait(); + timer.stop(); + resource.deallocate(device_value); + + double t = timer.elapsed(); + if (print_to_output) + { + std::cout << INDENT << INDENT << t << "s" << INDENT; + std::cout << GetImplName(atomic_impl) << ", "; + if (test_array) + { + std::cout << "Number of atomics under contention " << array_size << ", "; } + std::cout << num_iterations * N << " many atomic operations" + << ", "; + std::cout << ExecPolicy::PolicyName(); + std::cout << std::endl; + } } -template +template struct list_concat; -template -struct list_concat, camp::list> { - using type = camp::list; +template +struct list_concat, camp::list> +{ + using type = camp::list; }; -/// Holder for atomic operations that work with arbitrary atomic type, e.g. double, float -/// and ints etc. -template -struct universal_atomic_ops { - using type = camp::list, AtomicAdd>, - std::pair, AtomicSub>, - std::pair, AtomicMin>, - std::pair, AtomicMax>, - std::pair, AtomicIncBinary>, - std::pair, AtomicDecBinary>, - std::pair, AtomicIncUnary>, - std::pair, AtomicDecUnary>, - std::pair, AtomicLoad>, - std::pair, AtomicStore>, - std::pair, AtomicExchange>, - std::pair, AtomicCAS>>; +/// Holder for atomic operations that work with arbitrary atomic type, e.g. +/// double, float and ints etc. +template +struct universal_atomic_ops +{ + using type = camp::list< + std::pair, + AtomicAdd>, + std::pair, + AtomicSub>, + std::pair, + AtomicMin>, + std::pair, + AtomicMax>, + std::pair, + AtomicIncBinary>, + std::pair, + AtomicDecBinary>, + std::pair, + AtomicIncUnary>, + std::pair, + AtomicDecUnary>, + std::pair, + AtomicLoad>, + std::pair< + atomicWrapperDesulBinary, + AtomicStore>, + std::pair, + AtomicExchange>, + std::pair, + AtomicCAS>>; }; -template -struct integral_atomic_ops { - using type = camp::list, AtomicAnd>, - std::pair, AtomicOr>, - std::pair, AtomicXor>>; +template +struct integral_atomic_ops +{ + using type = + camp::list, + AtomicAnd>, + std::pair, + AtomicOr>, + std::pair, + AtomicXor>>; }; -template +template struct atomic_ops; /// Include all atomic ops if the underlying atomic to benchmark is integral. 
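/// (For a floating-point type, the specialization further below keeps only the
/// universal list. A minimal illustrative check, assuming a policy such as
/// RAJA::seq_atomic:
///
///   static_assert(
///       std::is_same<atomic_ops<double, RAJA::seq_atomic>::type,
///                    universal_atomic_ops<double, RAJA::seq_atomic>::type>::value,
///       "floating-point benchmarks use only the universal op list");
/// )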
-template -struct atomic_ops::value>::type> { - using type = typename list_concat::type, typename integral_atomic_ops::type>::type; +template +struct atomic_ops< + AtomicDataType, + Policy, + typename std::enable_if::value>::type> +{ + using type = typename list_concat< + typename universal_atomic_ops::type, + typename integral_atomic_ops::type>::type; }; /// Omit bitwise ops and, or, and xor for floating point types -template -struct atomic_ops::value>::type> { - using type = typename universal_atomic_ops< AtomicDataType, Policy >::type; +template +struct atomic_ops::value>::type> +{ + using type = typename universal_atomic_ops::type; }; -template -void ExecuteBenchmark(uint64_t N) { - using ops = atomic_ops; - using iter_t = typename ops::type; - auto iter = iter_t{}; - RAJA::for_each_type(iter, [&](auto type_pair) { +template +void ExecuteBenchmark(uint64_t N) +{ + using ops = atomic_ops; + using iter_t = typename ops::type; + auto iter = iter_t {}; + RAJA::for_each_type( + iter, + [&](auto type_pair) + { auto desul_functor = type_pair.first; - auto raja_functor = type_pair.second; - std::cout << INDENT << "Executing " << raja_functor.name << " integer benchmarks" << std::endl; - TimeAtomicOp(desul_functor, N, 100, 10000); - TimeAtomicOp(raja_functor, N, 100, 10000); - TimeAtomicOp(desul_functor, N, 10, 1000); - TimeAtomicOp(raja_functor, N, 10, 1000); + auto raja_functor = type_pair.second; + std::cout << INDENT << "Executing " << raja_functor.name + << " integer benchmarks" << std::endl; + TimeAtomicOp(desul_functor, N, 100, + 10000); + TimeAtomicOp(raja_functor, N, 100, + 10000); + TimeAtomicOp(desul_functor, N, 10, + 1000); + TimeAtomicOp(raja_functor, N, 10, + 1000); TimeAtomicOp(desul_functor, N, 4, 10); TimeAtomicOp(raja_functor, N, 4, 10); // Test contention over a single atomic TimeAtomicOp(desul_functor, N); TimeAtomicOp(raja_functor, N); - }); + }); } -int main (int argc, char* argv[]) { - if (argc > 2) { - RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the optional size of the benchmark loop"); - } - uint64_t N = 1000000000; - if (argc == 2) { - N = std::stoll(argv[1]); - } +int main(int argc, char* argv[]) +{ + if (argc > 2) + { + RAJA_ABORT_OR_THROW("Usage: ./benchmark-atomic.exe where N is the " + "optional size of the benchmark loop"); + } + uint64_t N = 1000000000; + if (argc == 2) + { + N = std::stoll(argv[1]); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp, int, true>( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + TimeAtomicOp, int, true>( + AtomicAdd {}, N, 10, 1000, false); + // GPU benchmarks + std::cout << "Executing GPU benchmarks" << std::endl; + ExecuteBenchmark>( + N); +#endif + +#if defined(RAJA_ENABLE_OPENMP) + // Perform an untimed initialization of both desul and RAJA atomics. + TimeAtomicOp( + AtomicAdd {}, N, 10, 1000, false); + TimeAtomicOp( + atomicWrapperDesulBinary {}, N, 10, + 1000, false); + + // OpenMP benchmarks + std::cout << "Executing OpenMP benchmarks" << std::endl; + ExecuteBenchmark(N); +#endif - #if defined (RAJA_ENABLE_CUDA) || defined (RAJA_ENABLE_HIP) - // Perform an untimed initialization of both desul and RAJA atomics. 
- TimeAtomicOp, int, true>(atomicWrapperDesulBinary{}, N, 10, 1000, false); - TimeAtomicOp, int, true>(AtomicAdd{}, N, 10, 1000, false); - // GPU benchmarks - std::cout << "Executing GPU benchmarks" << std::endl; - ExecuteBenchmark>(N); - #endif - - #if defined (RAJA_ENABLE_OPENMP) - // Perform an untimed initialization of both desul and RAJA atomics. - TimeAtomicOp(AtomicAdd{}, N, 10, 1000, false); - TimeAtomicOp(atomicWrapperDesulBinary{}, N, 10, 1000, false); - - // OpenMP benchmarks - std::cout << "Executing OpenMP benchmarks" << std::endl; - ExecuteBenchmark(N); - #endif - - return 0; + return 0; } diff --git a/benchmark/host-device-lambda-benchmark.cpp b/benchmark/host-device-lambda-benchmark.cpp index 33c78e2d8c..dde4f02efb 100644 --- a/benchmark/host-device-lambda-benchmark.cpp +++ b/benchmark/host-device-lambda-benchmark.cpp @@ -17,13 +17,15 @@ static void benchmark_daxpy_raw(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { a[i] += b[i] * c; }); } @@ -34,13 +36,15 @@ static void benchmark_daxpy_host(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), [=] __host__(int i) { a[i] += b[i] * c; }); } @@ -51,17 +55,18 @@ static void benchmark_daxpy_host_device(benchmark::State& state) double* a = new double[N]; double* b = new double[N]; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { a[i] = 1.0; b[i] = 2.0; } double c = 3.14159; - while (state.KeepRunning()) { + while (state.KeepRunning()) + { RAJA::forall(RAJA::RangeSegment(0, N), - [=] RAJA_HOST_DEVICE(int i) { - a[i] += b[i] * c; - }); + [=] RAJA_HOST_DEVICE(int i) + { a[i] += b[i] * c; }); } } diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index 15a3e54eb8..b464b77396 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -7,46 +7,54 @@ // Place the following line before including RAJA to enable // statistics on the Vector abstractions -//#define RAJA_ENABLE_VECTOR_STATS +// #define RAJA_ENABLE_VECTOR_STATS // Un-comment the following line to run correctness checks on each variant -//#define DEBUG_LTIMES -//#define DEBUG_MATRIX_LOAD_STORE +// #define DEBUG_LTIMES +// #define DEBUG_MATRIX_LOAD_STORE #include "RAJA/config.hpp" -#define VARIANT_C 1 -#define VARIANT_C_VIEWS 1 -#define VARIANT_RAJA_SEQ 1 -#define VARIANT_RAJA_SEQ_ARGS 1 -#define VARIANT_RAJA_TEAMS_SEQ 1 -#define VARIANT_RAJA_VECTOR 1 -#define VARIANT_RAJA_MATRIX 1 -#define VARIANT_RAJA_SEQ_SHMEM 1 +#define VARIANT_C 1 +#define VARIANT_C_VIEWS 1 +#define VARIANT_RAJA_SEQ 1 +#define VARIANT_RAJA_SEQ_ARGS 1 +#define VARIANT_RAJA_TEAMS_SEQ 1 +#define VARIANT_RAJA_VECTOR 1 +#define VARIANT_RAJA_MATRIX 1 +#define VARIANT_RAJA_SEQ_SHMEM 1 #if defined(RAJA_ENABLE_OPENMP) -#define VARIANT_RAJA_OPENMP 1 +#define VARIANT_RAJA_OPENMP 1 #endif #if defined(RAJA_ENABLE_CUDA) -#define VARIANT_CUDA_KERNEL 1 -#define VARIANT_CUDA_TEAMS 1 -#define VARIANT_CUDA_TEAMS_MATRIX 1 -#define VARIANT_CUDA_KERNEL_SHMEM 1 +#define VARIANT_CUDA_KERNEL 1 +#define VARIANT_CUDA_TEAMS 1 +#define VARIANT_CUDA_TEAMS_MATRIX 1 +#define VARIANT_CUDA_KERNEL_SHMEM 1 #endif #if 
defined(RAJA_ENABLE_HIP) -#define RAJA_HIP_KERNEL 1 -#define RAJA_HIP_KERNEL_SHMEM 1 +#define RAJA_HIP_KERNEL 1 +#define RAJA_HIP_KERNEL_SHMEM 1 #endif - extern "C" { - void dgemm_(char * transa, char * transb, int * m, int * n, int * k, - double * alpha, double * A, int * lda, - double * B, int * ldb, double * beta, - double *, int * ldc); +void dgemm_(char* transa, + char* transb, + int* m, + int* n, + int* k, + double* alpha, + double* A, + int* lda, + double* B, + int* ldb, + double* beta, + double*, + int* ldc); } #include @@ -94,7 +102,6 @@ extern "C" { */ - using namespace RAJA; using namespace RAJA::expt; @@ -115,43 +122,43 @@ RAJA_INDEX_VALUE_T(IZ, int, "IZ"); // Function to check results // template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { std::cout << "\n\nRAJA LTIMES example...\n\n"; -//----------------------------------------------------------------------------// -// Define array dimensions, allocate arrays, define Layouts and Views, etc. + //----------------------------------------------------------------------------// + // Define array dimensions, allocate arrays, define Layouts and Views, etc. // Note: rand()/RAND_MAX is always zero, but forces the compiler to not // optimize out these values as compile time constants - const int num_m = 25 + (rand()/RAND_MAX); - const int num_g = 160 + (rand()/RAND_MAX); - const int num_d = 80 + (rand()/RAND_MAX); + const int num_m = 25 + (rand() / RAND_MAX); + const int num_g = 160 + (rand() / RAND_MAX); + const int num_d = 80 + (rand() / RAND_MAX); #ifdef DEBUG_LTIMES - const int num_iter = 1 ; //+ (rand()/RAND_MAX);; + const int num_iter = 1; //+ (rand()/RAND_MAX);; // use a decreased number of zones since this will take a lot longer // and we're not really measuring performance here - const long num_z = 32 + (rand()/RAND_MAX); + const long num_z = 32 + (rand() / RAND_MAX); #else - const int num_iter = 10 + (rand()/RAND_MAX); - const int num_z = 32*657 + (rand()/RAND_MAX); + const int num_iter = 10 + (rand() / RAND_MAX); + const int num_z = 32 * 657 + (rand() / RAND_MAX); #endif + double total_flops = 2.0 * num_g * num_z * num_d * num_m * num_iter * 1000.0; - double total_flops = 2.0*num_g*num_z*num_d*num_m*num_iter*1000.0; - - std::cout << "num_m = " << num_m << ", num_g = " << num_g << - ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; + std::cout << "num_m = " << num_m << ", num_g = " << num_g + << ", num_d = " << num_d << ", num_z = " << num_z << "\n\n"; std::cout << "total flops: " << (long)total_flops << "\n"; @@ -167,1133 +174,1076 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) double* psi_data = &psi_vec[0]; double* phi_data = &phi_vec[0]; - for (int i = 0; i < L_size; ++i) { - L_data[i] = i+1; + for (int i = 0; i < L_size; ++i) + { + L_data[i] = i + 1; } - for (int i = 0; i < psi_size; ++i) { - psi_data[i] = 2*i+1; + for (int i = 0; i < psi_size; ++i) + { + psi_data[i] = 2 * i + 1; } // Note phi_data will be set to zero before each variant is run. 
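  //
  // Each variant below computes the same kernel: for every (m, g, z),
  //
  //   phi(m, g, z) += L(m, d) * psi(d, g, z)   summed over d,
  //
  // repeated num_iter times.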
-//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C -{ - std::cout << "\n Running baseline C-version of LTimes...\n"; + { + std::cout << "\n Running baseline C-version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // Using restrict doesn't make much of a difference for most compilers. + // Using restrict doesn't make much of a difference for most compilers. #if 1 - double * RAJA_RESTRICT L = L_data; - double * RAJA_RESTRICT psi = psi_data; - double * RAJA_RESTRICT phi = phi_data; + double* RAJA_RESTRICT L = L_data; + double* RAJA_RESTRICT psi = psi_data; + double* RAJA_RESTRICT phi = phi_data; #else - double * L = L_data; - double * psi = psi_data; - double * phi = phi_data; + double* L = L_data; + double* psi = psi_data; + double* phi = phi_data; #endif - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (int g = 0; g < num_g; ++g) { - for (int z = 0; z < num_z; ++z) { - for (int m = 0; m < num_m; ++m) { - for (int d = 0; d < num_d; ++d) { - phi[g*num_z*num_m + z*num_m + m] += - L[d*num_m + m] * psi[g*num_z*num_d + z*num_d + d]; + for (int iter = 0; iter < num_iter; ++iter) + for (int g = 0; g < num_g; ++g) + { + for (int z = 0; z < num_z; ++z) + { + for (int m = 0; m < num_m; ++m) + { + for (int d = 0; d < num_d; ++d) + { + phi[g * num_z * num_m + z * num_m + m] += + L[d * num_m + m] * psi[g * num_z * num_d + z * num_d + d]; + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; - -} + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_C_VIEWS -{ - std::cout << "\n Running C-version of LTimes (with Views)...\n"; + { + std::cout << "\n Running C-version of LTimes (with Views)...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView 
psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { - for (IM m(0); m < num_m; ++m) { - for (ID d(0); d < num_d; ++d) { - phi(m, g, z) += L(m, d) * psi(d, g, z); + for (int iter = 0; iter < num_iter; ++iter) + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { + for (IM m(0); m < num_m; ++m) + { + for (ID d(0); d < num_d; ++d) + { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + } } } - } - } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " C-version of LTimes run time (with Views) (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " C-version of LTimes run time (with Views) (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ -{ - std::cout << "\n Running RAJA sequential version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, 
IZ>; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_SEQ_ARGS -{ - std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<3, seq_exec, // z - statement::For<0, seq_exec, // m - statement::For<1, simd_exec, // d - statement::Lambda<0, Segs<0, 1, 2, 3>> - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA sequential ARGS version of LTimes...\n"; - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::memset(phi_data, 0, phi_size * sizeof(double)); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; -#if defined(DEBUG_LTIMES) 
- checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif -} -#endif + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; -//----------------------------------------------------------------------------// + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; -#if VARIANT_RAJA_TEAMS_SEQ -{ - std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + using EXECPOL = RAJA::KernelPolicy>>>>>>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + RAJA::Timer timer; + timer.start(); - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - std::array psi_perm {{2, 1, 0}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential ARGS version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; - std::array phi_perm {{2, 1, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } +#endif - using pol_launch = RAJA::LaunchPolicy; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + //----------------------------------------------------------------------------// +#if VARIANT_RAJA_TEAMS_SEQ + { + std::cout << "\n Running RAJA Teams sequential version of LTimes...\n"; + std::memset(phi_data, 0, phi_size * sizeof(double)); - RAJA::Timer timer; - timer.start(); + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=](RAJA::LaunchContext ctx){ + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - phi(m, g, z) += L(m, d) * psi(d, g, z); - }); - }); - }); - }); + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - }); // laucnch - } // iter - 
timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array psi_perm {{2, 1, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); -#if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); -#endif + std::array phi_perm {{2, 1, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); -} + using pol_launch = RAJA::LaunchPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; + + + RAJA::Timer timer; + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::launch( + RAJA::ExecPlace::HOST, RAJA::LaunchParams(), + [=](RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); + }); + }); + }); + }); // laucnch + } // iter + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA Teams sequential version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; + + +#if defined(DEBUG_LTIMES) + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); +#endif + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_VECTOR -{ - std::cout << "\n Running RAJA vectorized version of LTimes...\n"; + { + std::cout << "\n Running RAJA vectorized version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, 
phi_perm)); - using vector_t = RAJA::expt::VectorRegister; - using VecIZ = RAJA::expt::VectorIndex; + using vector_t = RAJA::expt::VectorRegister; + using VecIZ = RAJA::expt::VectorIndex; - using EXECPOL = - RAJA::KernelPolicy< - statement::For<2, seq_exec, // g - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d + using EXECPOL = RAJA::KernelPolicy< + statement::For<2, seq_exec, // g + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - >; + statement::Lambda<0>>>>>; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::expt::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - auto all_z = VecIZ::all(); + auto all_z = VecIZ::all(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g) { - phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, + [=](IM m, ID d, IG g) { + phi(m, g, all_z) += L(m, d) * psi(d, g, all_z); + }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA vectorized version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA vectorized version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA column-major matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(psi_data, - 
RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; - //using matrix_t = RAJA::expt::SquareMatrixRegister; -// using matrix_t = RAJA::expt::RectMatrixRegister; + using matrix_t = + RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::SquareMatrixRegister; + // using matrix_t = RAJA::expt::RectMatrixRegister; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - - using RowM = RAJA::expt::RowIndex; - using ColD = RAJA::expt::ColIndex; - using ColZ = RAJA::expt::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::tensor_stats::resetVectorStats(); #endif - RAJA::Timer timer; - timer.start(); - + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) + for (int iter = 0; iter < num_iter; ++iter) { - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - -// phi(rows_m, g, cols_z) = (L(rows_m, cols_d) * psi(rows_d, g, cols_z)) * (L(rows_m, cols_d) * psi(rows_d, g, cols_z)); - - }); - - - - } + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + + // phi(rows_m, g, cols_z) = (L(rows_m, + // cols_d) * psi(rows_d, g, cols_z)) * + // (L(rows_m, cols_d) * psi(rows_d, g, + // cols_z)); + }); + } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA column-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout + << " RAJA column-major matrix version of LTimes run time (sec.): " << t + << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_RAJA_MATRIX -{ - std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA row-major matrix version of LTimes...\n"; - 
std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 0, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 0, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 0, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 0, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::expt::SquareMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; - std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << - "x" << matrix_t::s_dim_elem(1) << std::endl; + std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" + << matrix_t::s_dim_elem(1) << std::endl; using RowM = RAJA::expt::RowIndex; using ColD = RAJA::expt::ColIndex; using ColZ = RAJA::expt::ColIndex; - #ifdef RAJA_ENABLE_VECTOR_STATS +#ifdef RAJA_ENABLE_VECTOR_STATS RAJA::expt::tensor_stats::resetVectorStats(); - #endif +#endif RAJA::Timer timer; timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), - [=](IG g) - { - - auto rows_m = RowM::all(); - auto cols_z = ColZ::all(); - auto cols_d = ColD::all(); - auto rows_d = toRowIndex(cols_d); - - phi(rows_m, g, cols_z) += - L(rows_m, cols_d) * psi(rows_d, g, cols_z); - - }); - - + for (int iter = 0; iter < num_iter; ++iter) + { + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + [=](IG g) + { + auto rows_m = RowM::all(); + auto cols_z = ColZ::all(); + auto cols_d = ColD::all(); + auto rows_d = toRowIndex(cols_d); + + phi(rows_m, g, cols_z) += + L(rows_m, cols_d) * psi(rows_d, g, cols_z); + }); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA row-major matrix version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::printVectorStats(); + RAJA::tensor_stats::printVectorStats(); #endif #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - - -} + } #endif //----------------------------------------------------------------------------// #if 
VARIANT_RAJA_SEQ_SHMEM -{ - std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; + { + std::cout << "\n Running RAJA sequential shmem version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + std::memset(phi_data, 0, phi_size * sizeof(double)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - constexpr size_t tile_m = 25; - constexpr size_t tile_d = 80; - constexpr size_t tile_z = 256; - constexpr size_t tile_g = 0; + constexpr size_t tile_m = 25; + constexpr size_t tile_d = 80; + constexpr size_t tile_z = 256; + constexpr size_t tile_g = 0; - using RAJA::statement::Param; + using RAJA::statement::Param; - using EXECPOL = - RAJA::KernelPolicy< + using EXECPOL = RAJA::KernelPolicy< - // Create memory tiles - statement::InitLocalMem, + // Create memory tiles + statement::InitLocalMem< + RAJA::cpu_tile_mem, RAJA::ParamList<0, 1, 2>, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d - // Load L(m,d) for m,d tile into shmem - statement::For<0, seq_exec, // m - statement::For<1, seq_exec, // d - statement::Lambda<0, Segs<0, 1>, - Params<0>, - Offsets<0, 1>> - > - >, + // Load L(m,d) for m,d tile into shmem + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, - // Run inner g, z loops with z loop tiled - statement::For<2, seq_exec, // g - statement::Tile<3, tile_fixed, seq_exec, // z + // Run inner g, z loops with z loop tiled + statement::For< + 2, seq_exec, // g + statement::Tile<3, tile_fixed, seq_exec, // z - // Load psi into shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<1, Segs<1, 2, 3>, - Params<1>, - Offsets<1, 2, 3>> - > - >, + // Load psi into shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, - // Compute phi - statement::For<0, seq_exec, // m + // Compute phi + statement::For< + 0, seq_exec, // m - // Load 
phi into shmem - statement::For<3, seq_exec, // z - statement::Lambda<2, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - >, + // Load phi into shmem + statement::For< + 3, seq_exec, // z + statement::Lambda< + 2, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>, - // Compute phi in shmem - statement::For<1, seq_exec, // d - statement::For<3, seq_exec, // z - statement::Lambda<3, Params<0, 1, 2>, - Offsets<0, 1, 2, 3>> - > - >, + // Compute phi in shmem + statement::For< + 1, seq_exec, // d + statement::For< + 3, seq_exec, // z + statement::Lambda< + 3, Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>>>, - // Store phi - statement:: For<3, seq_exec, // z - statement::Lambda<4, Segs<0, 2, 3>, - Params<2>, - Offsets<0, 2, 3>> - > - > // m + // Store phi + statement::For< + 3, seq_exec, // z + statement::Lambda< + 4, Segs<0, 2, 3>, Params<2>, + Offsets<0, 2, 3>>>> // m - > // Tile z - > // g + > // Tile z + > // g - > // Tile d - > // Tile m - > // LocalMemory - >; // KernelPolicy + > // Tile d + > // Tile m + > // LocalMemory + >; // KernelPolicy + // + // Define statically dimensioned local arrays used in kernel + // - // - // Define statically dimensioned local arrays used in kernel - // + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; + using shmem_phi_t = + RAJA::TypedLocalArray, IM, IG, + IZ>; + shmem_phi_t shmem_phi; - using shmem_phi_t = RAJA::TypedLocalArray, - IM, IG, IZ>; - shmem_phi_t shmem_phi; + RAJA::Timer timer; + timer.start(); - RAJA::Timer timer; - timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + RAJA::make_tuple(shmem_L, shmem_psi, shmem_phi), - RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). 
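As the comments above note, the second tuple passed to RAJA::kernel_param supplies the parameter objects, and each lambda receives the segment indices first, followed by those parameters. A minimal, self-contained sketch of that convention (the view name, sizes, and the scale-by-2 operation are illustrative only, not taken from this benchmark):

#include "RAJA/RAJA.hpp"

// Sketch of the kernel_param calling convention: segment indices arrive
// first in the lambda, then the entries of the parameter tuple.
void scale_matrix(double* a, int nrows, int ncols)
{
  RAJA::View<double, RAJA::Layout<2>> A(a, nrows, ncols);

  using POL = RAJA::KernelPolicy<
      RAJA::statement::For<0, RAJA::seq_exec,        // rows
          RAJA::statement::For<1, RAJA::seq_exec,    // cols
              RAJA::statement::Lambda<0>>>>;

  RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, nrows),
                       RAJA::RangeSegment(0, ncols)),
      RAJA::make_tuple(2.0),  // parameter tuple: a single scale factor
      [=](RAJA::Index_type r, RAJA::Index_type c, double factor)
      { A(r, c) *= factor; });
}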
- RAJA::make_tuple( shmem_L, - shmem_psi, - shmem_phi), - - - // Lambda<0> : Load L into shmem - [=] (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) - { - sh_L(tm, td) = L(m, d); - }, - // Lambda<1> : Load psi into shmem - [=] (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) - { - sh_psi(td, tg, tz) = psi(d, g, z); - }, + // Lambda<0> : Load L into shmem + [=](IM m, ID d, shmem_L_t& sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, - // Lambda<2> : Load phi into shmem - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) = phi(m, g, z); - }, + // Lambda<1> : Load psi into shmem + [=](ID d, IG g, IZ z, shmem_psi_t& sh_psi, ID td, IG tg, IZ tz) + { sh_psi(td, tg, tz) = psi(d, g, z); }, - // Lambda<3> : Compute phi in shmem - [=] (shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, - IM tm, ID td, IG tg, IZ tz) - { - sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); - }, + // Lambda<2> : Load phi into shmem + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { sh_phi(tm, tg, tz) = phi(m, g, z); }, - // Lambda<4> : Store phi - [=] (IM m, IG g, IZ z, - shmem_phi_t& sh_phi, - IM tm, IG tg, IZ tz) - { - phi(m, g, z) = sh_phi(tm, tg, tz); - } + // Lambda<3> : Compute phi in shmem + [=](shmem_L_t& sh_L, shmem_psi_t& sh_psi, shmem_phi_t& sh_phi, IM tm, + ID td, IG tg, IZ tz) + { sh_phi(tm, tg, tz) += sh_L(tm, td) * sh_psi(td, tg, tz); }, - ); + // Lambda<4> : Store phi + [=](IM m, IG g, IZ z, shmem_phi_t& sh_phi, IM tm, IG tg, IZ tz) + { phi(m, g, z) = sh_phi(tm, tg, tz); } - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " - << t <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA sequential shmem version of LTimes run time (sec.): " + << t << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) && (VARIANT_RAJA_OPENMP) -{ - std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; + { + std::cout << "\n Running RAJA OpenMP version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(L_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(L_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 
1, 2}}; - PsiView psi(psi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(phi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); #if 1 - using EXECPOL = - RAJA::KernelPolicy< - statement::For<0, omp_parallel_for_exec, // m - statement::For<1, seq_exec, // d - statement::For<2, seq_exec, // g - statement::For<3, simd_exec, // z - statement::Lambda<0> - > - > - > - > - >; + using EXECPOL = RAJA::KernelPolicy>>>>>; #else - // - // Benefits of using OpenMP collapse depends on compiler, platform, - // relative segment sizes. - // - using EXECPOL = - RAJA::KernelPolicy< - statement::Collapse, // m, g, z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - >; + // + // Benefits of using OpenMP collapse depends on compiler, platform, + // relative segment sizes. + // + using EXECPOL = RAJA::KernelPolicy, // m, g, z + statement::For<1, seq_exec, // d + statement::Lambda<0>>>>; #endif - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - RAJA::Timer timer; - timer.start(); + RAJA::Timer timer; + timer.start(); - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=](IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA OpenMP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA OpenMP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL -{ - std::cout << "\n Running RAJA CUDA version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - - // - // 
View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::For<0, cuda_block_x_loop, // m - statement::For<2, cuda_block_y_loop, // g - statement::For<3, cuda_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA CUDA version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); + + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - cudaErrchk( cudaDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * 
psi(d, g, z); }); + cudaErrchk(cudaDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif - -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_TEAMS -{ - std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + 
using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(160, 1, 1), - RAJA::Threads(8, 64, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + for (int iter = 0; iter < num_iter; ++iter) { - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_m), [&](IM m){ - - double acc = phi(m, g, z); - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_d), [&](ID d){ - - acc += L(m, d) * psi(d, g, z); - - - }); - - phi(m,g,z) = acc; + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(160, 1, 1), RAJA::Threads(8, 64, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_z), + [&](IZ z) + { + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_m), + [&](IM m) + { + double acc = phi(m, g, z); + + RAJA::loop( + ctx, RAJA::TypedRangeSegment(0, num_d), + [&](ID d) + { + acc += L(m, d) * psi(d, g, z); + }); + + phi(m, g, z) = acc; + }); + }); + }); }); - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); + } + cudaErrchk(cudaDeviceSynchronize()); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, 
num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #ifdef __CUDA_ARCH__ #define RAJA_GET_POLICY(POL) typename POL::device_policy_t @@ -1303,732 +1253,711 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if VARIANT_CUDA_TEAMS_MATRIX -{ - std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; + { + std::cout << "\n Running RAJA CUDA Teams+Matrix version of LTimes...\n"; - std::memset(phi_data, 0, phi_size * sizeof(double)); + std::memset(phi_data, 0, phi_size * sizeof(double)); - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); - using matrix_layout = RowMajorLayout; + using matrix_layout = RowMajorLayout; - using L_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using L_matrix_device_t = RAJA::expt::RectMatrixRegister; - using L_matrix_hd_t = RAJA::LaunchPolicy; + using L_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using L_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using L_matrix_hd_t = + RAJA::LaunchPolicy; - using phi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using phi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using phi_matrix_hd_t = RAJA::LaunchPolicy; + using phi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using phi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using phi_matrix_hd_t = + RAJA::LaunchPolicy; - using psi_matrix_host_t = RAJA::expt::SquareMatrixRegister; - using psi_matrix_device_t = RAJA::expt::RectMatrixRegister; - using psi_matrix_hd_t = RAJA::LaunchPolicy; + using psi_matrix_host_t = + RAJA::expt::SquareMatrixRegister; + using psi_matrix_device_t = + RAJA::expt::RectMatrixRegister; + using psi_matrix_hd_t = + RAJA::LaunchPolicy; - using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; + using pol_launch = + RAJA::LaunchPolicy>; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + 
using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; - std::array L_perm {{1, 0}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + std::array L_perm {{1, 0}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - std::array psi_perm {{1, 2, 0}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + std::array psi_perm {{1, 2, 0}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - std::array phi_perm {{1, 2, 0}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + std::array phi_perm {{1, 2, 0}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - auto seg_g = RAJA::TypedRangeSegment(0, num_g); - auto seg_z = RAJA::TypedRangeSegment(0, num_z); - auto seg_m = RAJA::TypedRangeSegment(0, num_m); - auto seg_d = RAJA::TypedRangeSegment(0, num_d); + auto seg_g = RAJA::TypedRangeSegment(0, num_g); + auto seg_z = RAJA::TypedRangeSegment(0, num_z); + auto seg_m = RAJA::TypedRangeSegment(0, num_m); + auto seg_d = RAJA::TypedRangeSegment(0, num_d); - printf("num_iter=%d\n", (int)num_iter); - for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch( - RAJA::ExecPlace::DEVICE, - RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), - RAJA::Threads(32, 32, 1)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) + printf("num_iter=%d\n", (int)num_iter); + for (int iter = 0; iter < num_iter; ++iter) { + RAJA::launch( + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), + RAJA::Threads(32, 32, 1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); + using L_RowM = RAJA::expt::RowIndex; + using L_ColD = RAJA::expt::ColIndex; + + using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); + using psi_RowD = RAJA::expt::RowIndex; + using psi_ColZ = RAJA::expt::ColIndex; + + using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); + using phi_RowM = RAJA::expt::RowIndex; + using phi_ColZ = RAJA::expt::ColIndex; + + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), + [&](IG g) + { + RAJA::tile( + ctx, 32, + RAJA::TypedRangeSegment(0, num_z), + [&](RAJA::TypedRangeSegment tzi) + { + RAJA::TypedRangeSegment tz( + *tzi.begin(), *tzi.end()); + + phi(phi_RowM::all(), g, phi_ColZ(tz)) += + L(L_RowM::all(), L_ColD::all()) * + psi(psi_RowD::all(), g, psi_ColZ(tz)); + }); + }); + }); + } + cudaErrchk(cudaDeviceSynchronize()); - - using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); - using L_RowM = RAJA::expt::RowIndex; - using L_ColD = RAJA::expt::ColIndex; - - using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); - using psi_RowD = RAJA::expt::RowIndex; - using psi_ColZ = RAJA::expt::ColIndex; - - using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); - using phi_RowM = RAJA::expt::RowIndex; - using phi_ColZ = RAJA::expt::ColIndex; - - - RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ - - RAJA::tile(ctx, 32, RAJA::TypedRangeSegment(0, 
num_z), [&](RAJA::TypedRangeSegment tzi){ - - RAJA::TypedRangeSegment tz(*tzi.begin(), *tzi.end()); - - phi(phi_RowM::all(), g, phi_ColZ(tz)) += - L(L_RowM::all(), L_ColD::all()) * psi(psi_RowD::all(), g, psi_ColZ(tz)); - - }); - }); - - }); - - } - cudaErrchk( cudaDeviceSynchronize() ); - - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA Teams+Matrix version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); + cudaErrchk(cudaFree(dL_data)); + cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if VARIANT_CUDA_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - cudaErrchk( cudaMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dL_data, L_data, L_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); - cudaErrchk( cudaMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - cudaErrchk( cudaMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - cudaMemcpyHostToDevice ) ); + { + std::cout << "\n Running RAJA CUDA + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + cudaErrchk(cudaMalloc((void**)&dL_data, L_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dL_data, L_data, L_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + cudaMemcpyHostToDevice)); + cudaErrchk(cudaMalloc((void**)&dphi_data, phi_size * sizeof(double))); + cudaErrchk(cudaMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + cudaMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 
dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, cuda_thread_x_loop, // d + statement::For<0, cuda_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::CudaSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, cuda_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + cuda_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, cuda_thread_x_direct, // z + statement::For<1, cuda_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::CudaSyncThreads, + + // Compute phi + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile over directions + statement::CudaSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, cuda_thread_x_direct, // z + statement::For< + 0, cuda_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::CudaSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // CudaKernelAsync + + >; // KernelPolicy - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - - - // - // Define statically dimensioned local arrays used in kernel - // 
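The tile extents above (tile_m, tile_d, tile_z) drive a standard blocking scheme: a tile of L(m, d) is staged in fast memory once and then reused for every z in the z tile. A plain-C++ sketch of that reuse pattern, with hypothetical row-major indexing and the group dimension omitted for brevity:

// Blocking sketch only; the extents and flat indexing are assumptions, and
// the actual kernel expresses this with InitLocalMem/Tile/For statements.
constexpr int TM = 4;
constexpr int TD = 8;

void ltimes_blocked(double* phi, const double* L, const double* psi,
                    int num_m, int num_d, int num_z)
{
  double L_tile[TM][TD];  // stands in for the shared-memory tile

  for (int m0 = 0; m0 < num_m; m0 += TM)
  {
    for (int d0 = 0; d0 < num_d; d0 += TD)
    {
      // Stage the L(m, d) tile once...
      for (int m = 0; m < TM && m0 + m < num_m; ++m)
        for (int d = 0; d < TD && d0 + d < num_d; ++d)
          L_tile[m][d] = L[(m0 + m) * num_d + (d0 + d)];

      // ...then reuse every staged value across the whole z range.
      for (int m = 0; m < TM && m0 + m < num_m; ++m)
        for (int d = 0; d < TD && d0 + d < num_d; ++d)
          for (int z = 0; z < num_z; ++z)
            phi[(m0 + m) * num_z + z] +=
                L_tile[m][d] * psi[(d0 + d) * num_z + z];
    }
  }
}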
- - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::CudaKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, cuda_thread_x_loop, // d - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::CudaSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, cuda_block_y_loop, // g - statement::Tile<3, tile_fixed, cuda_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3,cuda_thread_x_direct, // z - statement::For<1, cuda_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::CudaSyncThreads, - - // Compute phi - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::CudaSyncThreads, - - // Write out phi from thread local storage - statement::For<3, cuda_thread_x_direct, // z - statement::For<0, cuda_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::CudaSyncThreads + RAJA::Timer timer; + cudaErrchk(cudaDeviceSynchronize()); + timer.start(); - > // Tile z - > // g + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), - > // Tile d - > // Tile m - > // init shmem - > // CudaKernelAsync - - >; // KernelPolicy + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z, shmem_L_t & sh_L, + shmem_psi_t & sh_psi, double& phi_local, IM tm, ID td, + IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, - RAJA::Timer timer; - cudaErrchk( cudaDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. - RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z, - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } - cudaDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + ); + cudaDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA CUDA + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - cudaErrchk( cudaMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - cudaMemcpyDeviceToHost ) ); + cudaErrchk(cudaMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + cudaMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - cudaErrchk( cudaFree( dL_data ) ); - cudaErrchk( cudaFree( dpsi_data ) ); - cudaErrchk( cudaFree( dphi_data ) ); -} + cudaErrchk(cudaFree(dL_data)); + 
cudaErrchk(cudaFree(dpsi_data)); + cudaErrchk(cudaFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL -{ - std::cout << "\n Running RAJA HIP version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::For<0, hip_block_x_loop, // m - statement::For<2, hip_block_y_loop, // g - statement::For<3, hip_thread_x_loop, // z - statement::For<1, seq_exec, // d - statement::Lambda<0> - > - > - > - > - > - >; - - auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)); - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel( segments, - [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { - phi(m, g, z) += L(m, d) * psi(d, g, z); - } - ); + { + std::cout << "\n Running RAJA HIP version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, 
num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + using EXECPOL = RAJA::KernelPolicy>>>>>>; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)); - hipErrchk( hipDeviceSynchronize() ); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel(segments, [=] RAJA_DEVICE(IM m, ID d, IG g, IZ z) + { phi(m, g, z) += L(m, d) * psi(d, g, z); }); - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipDeviceSynchronize()); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif -} + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// #if RAJA_HIP_KERNEL_SHMEM -{ - std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; - - std::memset(phi_data, 0, phi_size * sizeof(double)); - - double* dL_data = nullptr; - double* dpsi_data = nullptr; - double* dphi_data = nullptr; - - hipErrchk( hipMalloc( (void**)&dL_data, L_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dL_data, L_data, L_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dpsi_data, psi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dpsi_data, psi_data, psi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - hipErrchk( hipMalloc( (void**)&dphi_data, phi_size * sizeof(double) ) ); - hipErrchk( hipMemcpy( dphi_data, phi_data, phi_size * sizeof(double), - hipMemcpyHostToDevice ) ); - - - // - // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension - using LView = TypedView, IM, ID>; - - // psi(d, g, z) : 2 -> z is stride-1 dimension - using PsiView = TypedView, ID, IG, IZ>; - - // phi(m, g, z) : 2 -> z is stride-1 dimension - using PhiView = TypedView, IM, IG, IZ>; - - std::array L_perm {{0, 1}}; - LView L(dL_data, - RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); - - std::array psi_perm {{0, 1, 2}}; - PsiView psi(dpsi_data, - RAJA::make_permuted_layout({{num_d, num_g, num_z}}, 
psi_perm)); - - std::array phi_perm {{0, 1, 2}}; - PhiView phi(dphi_data, - RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - - - static const int tile_m = 25; - static const int tile_d = 90; - static const int tile_g = 0; - static const int tile_z = 40; - - - + { + std::cout << "\n Running RAJA HIP + shmem version of LTimes...\n"; + + std::memset(phi_data, 0, phi_size * sizeof(double)); + + double* dL_data = nullptr; + double* dpsi_data = nullptr; + double* dphi_data = nullptr; + + hipErrchk(hipMalloc((void**)&dL_data, L_size * sizeof(double))); + hipErrchk(hipMemcpy(dL_data, L_data, L_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dpsi_data, psi_size * sizeof(double))); + hipErrchk(hipMemcpy(dpsi_data, psi_data, psi_size * sizeof(double), + hipMemcpyHostToDevice)); + hipErrchk(hipMalloc((void**)&dphi_data, phi_size * sizeof(double))); + hipErrchk(hipMemcpy(dphi_data, phi_data, phi_size * sizeof(double), + hipMemcpyHostToDevice)); + + + // + // View types and Views/Layouts for indexing into arrays + // + // L(m, d) : 1 -> d is stride-1 dimension + using LView = TypedView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = TypedView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = TypedView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(dL_data, RAJA::make_permuted_layout({{num_m, num_d}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(dpsi_data, + RAJA::make_permuted_layout({{num_d, num_g, num_z}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(dphi_data, + RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); + + + static const int tile_m = 25; + static const int tile_d = 90; + static const int tile_g = 0; + static const int tile_z = 40; + + + // + // Define statically dimensioned local arrays used in kernel + // + + using shmem_L_t = + RAJA::TypedLocalArray, IM, ID>; + shmem_L_t shmem_L; + + + using shmem_psi_t = + RAJA::TypedLocalArray, ID, IG, + IZ>; + shmem_psi_t shmem_psi; + + + // + // Define our execution policy + // + + using RAJA::Offsets; + using RAJA::Params; + using RAJA::Segs; + using RAJA::statement::Param; + + using EXECPOL = + RAJA::KernelPolicy, + // Tile outer m,d loops + statement::Tile< + 0, tile_fixed, seq_exec, // m + statement::Tile< + 1, tile_fixed, seq_exec, // d + + // Load L for m,d tile into shmem + statement::For<1, hip_thread_x_loop, // d + statement::For<0, hip_thread_y_direct, // m + statement::Lambda< + 0, Segs<0, 1>, Params<0>, + Offsets<0, 1>>>>, + statement::HipSyncThreads, + + // Distribute g, z across blocks and tile z + statement::For< + 2, hip_block_y_loop, // g + statement::Tile< + 3, tile_fixed, + hip_block_x_loop, // z + + // Load phi into thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<2, Segs<0, 2, 3>, + Params<2>>>>, + + // Load slice of psi into shmem + statement::For< + 3, hip_thread_x_direct, // z + statement::For<1, hip_thread_y_loop, // d + // (reusing + // y) + statement::Lambda< + 1, Segs<1, 2, 3>, Params<1>, + Offsets<1, 2, 3>>>>, + statement::HipSyncThreads, + + // Compute phi + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + + // Compute thread-local Phi value and store + statement::For< + 1, seq_exec, // d + statement::Lambda< + 3, Segs<0, 1, 2, 3>, + Params<0, 1, 2>, + Offsets<0, 1, 2, 3>>> // d + > // m + >, // z + + // finish tile 
over directions + statement::HipSyncThreads, + + // Write out phi from thread local storage + statement::For< + 3, hip_thread_x_direct, // z + statement::For< + 0, hip_thread_y_direct, // m + statement::Lambda<4, Segs<0, 2, 3>, + Params<2>>>>, + statement::HipSyncThreads + + > // Tile z + > // g + + > // Tile d + > // Tile m + > // init shmem + > // HipKernelAsync + + >; // KernelPolicy - // - // Define statically dimensioned local arrays used in kernel - // - using shmem_L_t = RAJA::TypedLocalArray, - IM, ID>; - shmem_L_t shmem_L; - - - using shmem_psi_t = RAJA::TypedLocalArray, - ID, IG, IZ>; - shmem_psi_t shmem_psi; - - - - // - // Define our execution policy - // - - using RAJA::statement::Param; - using RAJA::Segs; - using RAJA::Params; - using RAJA::Offsets; - - using EXECPOL = - RAJA::KernelPolicy< - statement::HipKernelAsync< - statement::InitLocalMem, - // Tile outer m,d loops - statement::Tile<0, tile_fixed, seq_exec, // m - statement::Tile<1, tile_fixed, seq_exec, // d - - // Load L for m,d tile into shmem - statement::For<1, hip_thread_x_loop, // d - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> - > - >, - statement::HipSyncThreads, - - // Distribute g, z across blocks and tile z - statement::For<2, hip_block_y_loop, // g - statement::Tile<3, tile_fixed, hip_block_x_loop, // z - - // Load phi into thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<2, Segs<0,2,3>, Params<2>> - > - >, - - // Load slice of psi into shmem - statement::For<3, hip_thread_x_direct, // z - statement::For<1, hip_thread_y_loop, // d (reusing y) - statement::Lambda<1, Segs<1,2,3>, Params<1>, Offsets<1,2,3>> - > - >, - statement::HipSyncThreads, - - // Compute phi - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - - // Compute thread-local Phi value and store - statement::For<1, seq_exec, // d - statement::Lambda<3, Segs<0,1,2,3>, Params<0,1,2>, Offsets<0,1,2,3>> - > // d - > // m - >, // z - - // finish tile over directions - statement::HipSyncThreads, - - // Write out phi from thread local storage - statement::For<3, hip_thread_x_direct, // z - statement::For<0, hip_thread_y_direct, // m - statement::Lambda<4, Segs<0,2,3>, Params<2>> - > - >, - statement::HipSyncThreads - - > // Tile z - > // g - - > // Tile d - > // Tile m - > // init shmem - > // HipKernelAsync - - >; // KernelPolicy - - - - - RAJA::Timer timer; - hipErrchk( hipDeviceSynchronize() ); - timer.start(); - - for (int iter = 0;iter < num_iter;++ iter) - RAJA::kernel_param( - RAJA::make_tuple( - RAJA::TypedRangeSegment(0, num_m), - RAJA::TypedRangeSegment(0, num_d), - RAJA::TypedRangeSegment(0, num_g), - RAJA::TypedRangeSegment(0, num_z)), - - // For kernel_param, second arg is a tuple of data objects used in lambdas. - // They are the last args in all lambdas (after indices). - // Here, the last entry '0.0' yields a thread-private temporary for - // computing a phi value, for shared memory before writing to phi array. 
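Stripped of the policy machinery, the work done by Lambdas 2 through 4 below for a single (m, g, z) triple is a load / accumulate-over-d / store sequence around a thread-private scalar. A sketch with hypothetical flat row-major indexing (the benchmark itself goes through permuted Views):

// Per-(m, g, z) equivalent of Lambda<2> (load), Lambda<3> (accumulate),
// and Lambda<4> (store); the indexing here is assumed for illustration.
inline void ltimes_point(double* phi, const double* L, const double* psi,
                         int m, int g, int z,
                         int num_d, int num_g, int num_z)
{
  double phi_local = phi[(m * num_g + g) * num_z + z];  // Lambda<2>
  for (int d = 0; d < num_d; ++d)                       // Lambda<3>
    phi_local += L[m * num_d + d] * psi[(d * num_g + g) * num_z + z];
  phi[(m * num_g + g) * num_z + z] = phi_local;         // Lambda<4>
}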
- RAJA::make_tuple( shmem_L, - shmem_psi, - 0.0), - - // Lambda<0> : Load L into shmem - [=] RAJA_DEVICE (IM m, ID d, - shmem_L_t& sh_L, - IM tm, ID td) { - sh_L(tm, td) = L(m, d); - }, - - // Lambda<1> : Load slice of psi into shmem - [=] RAJA_DEVICE (ID d, IG g, IZ z, - shmem_psi_t& sh_psi, - ID td, IG tg, IZ tz) { - - sh_psi(td, tg, tz) = psi(d, g, z); - }, - - // Lambda<2> : Load thread-local phi value - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi_local = phi(m, g, z); - }, - - // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), - IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), - shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, - IM tm, ID td, IG tg, IZ tz) { - - phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); - }, - - // Lambda<4> : Store phi - [=] RAJA_DEVICE (IM m, IG g, IZ z, - double& phi_local) { - - phi(m, g, z) = phi_local; - } - - ); - - hipDeviceSynchronize(); - timer.stop(); - double t = timer.elapsed(); - double gflop_rate = total_flops / t / 1.0e9; - std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " - << timer.elapsed() <<", GFLOPS/sec: " << gflop_rate << std::endl; + RAJA::Timer timer; + hipErrchk(hipDeviceSynchronize()); + timer.start(); + for (int iter = 0; iter < num_iter; ++iter) + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), + RAJA::TypedRangeSegment(0, num_d), + RAJA::TypedRangeSegment(0, num_g), + RAJA::TypedRangeSegment(0, num_z)), + + // For kernel_param, second arg is a tuple of data objects used in + // lambdas. They are the last args in all lambdas (after indices). + // Here, the last entry '0.0' yields a thread-private temporary for + // computing a phi value, for shared memory before writing to phi + // array. 
+ RAJA::make_tuple(shmem_L, shmem_psi, 0.0), + + // Lambda<0> : Load L into shmem + [=] RAJA_DEVICE(IM m, ID d, shmem_L_t & sh_L, IM tm, ID td) + { sh_L(tm, td) = L(m, d); }, + + // Lambda<1> : Load slice of psi into shmem + [=] RAJA_DEVICE(ID d, IG g, IZ z, shmem_psi_t & sh_psi, ID td, IG tg, + IZ tz) { sh_psi(td, tg, tz) = psi(d, g, z); }, + + // Lambda<2> : Load thread-local phi value + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi_local = phi(m, g, z); }, + + // Lambda<3> Compute thread-local phi value + [=] RAJA_DEVICE(IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), + IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), + shmem_L_t & sh_L, shmem_psi_t & sh_psi, + double& phi_local, IM tm, ID td, IG tg, IZ tz) + { phi_local += sh_L(tm, td) * sh_psi(td, tg, tz); }, + + // Lambda<4> : Store phi + [=] RAJA_DEVICE(IM m, IG g, IZ z, double& phi_local) + { phi(m, g, z) = phi_local; } + + ); + + hipDeviceSynchronize(); + timer.stop(); + double t = timer.elapsed(); + double gflop_rate = total_flops / t / 1.0e9; + std::cout << " RAJA HIP + shmem version of LTimes run time (sec.): " + << timer.elapsed() << ", GFLOPS/sec: " << gflop_rate << std::endl; #if defined(DEBUG_LTIMES) - hipErrchk( hipMemcpy( phi_data, dphi_data, phi_size * sizeof(double), - hipMemcpyDeviceToHost ) ); + hipErrchk(hipMemcpy(phi_data, dphi_data, phi_size * sizeof(double), + hipMemcpyDeviceToHost)); - // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); - checkResult(phi, L, psi, num_m, num_d, num_g, num_z); + // Reset data in Views to CPU data + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); + checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif - hipErrchk( hipFree( dL_data ) ); - hipErrchk( hipFree( dpsi_data ) ); - hipErrchk( hipFree( dphi_data ) ); -} + hipErrchk(hipFree(dL_data)); + hipErrchk(hipFree(dpsi_data)); + hipErrchk(hipFree(dphi_data)); + } #endif -//----------------------------------------------------------------------------// + //----------------------------------------------------------------------------// std::cout << "\n DONE!...\n"; @@ -2039,36 +1968,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Function to check result and report P/F. 
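The check recomputes the defining contraction for every entry, phi(m, g, z) = sum over d of L(m, d) * psi(d, g, z), and flags any entry whose absolute difference from the stored phi exceeds 1e-9.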
// template -void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, +void checkResult(PHIVIEW_T& phi, + LVIEW_T& L, + PSIVIEW_T& psi, const int num_m, const int num_d, const int num_g, const int num_z) { - size_t nerrors = 0; + size_t nerrors = 0; double total_error = 0.0; - for (IM m(0); m < num_m; ++m) { - for (IG g(0); g < num_g; ++g) { - for (IZ z(0); z < num_z; ++z) { + for (IM m(0); m < num_m; ++m) + { + for (IG g(0); g < num_g; ++g) + { + for (IZ z(0); z < num_z; ++z) + { double total = 0.0; - for (ID d(0); d < num_d; ++d) { + for (ID d(0); d < num_d; ++d) + { double val = L(m, d) * psi(d, g, z); total += val; } - if (std::abs(total-phi(m, g, z)) > 1e-9) { - printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", - (int)*g, (int)*z, (int)*m, phi(m,g,z), total); + if (std::abs(total - phi(m, g, z)) > 1e-9) + { + printf("ERR: g=%d, z=%d, m=%d, val=%.12e, expected=%.12e\n", (int)*g, + (int)*z, (int)*m, phi(m, g, z), total); ++nerrors; } - total_error += std::abs(total-phi(m, g, z)); + total_error += std::abs(total - phi(m, g, z)); } } } - if ( nerrors == 0 ) { + if (nerrors == 0) + { std::cout << "\n\t result -- PASS\n"; - } else { + } + else + { std::cout << "\n\t result -- FAIL : " << nerrors << " errors!\n"; } } diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp index 331d6c51dd..3ed7dcd11d 100644 --- a/benchmark/raja_view_blur.cpp +++ b/benchmark/raja_view_blur.cpp @@ -15,117 +15,107 @@ * */ -//Uncomment to specify variant -//#define RUN_HIP_VARIANT -//#define RUN_CUDA_VARIANT -//#define RUN_SYCL_VARIANT -//#define RUN_OPENMP_VARIANT +// Uncomment to specify variant +// #define RUN_HIP_VARIANT +// #define RUN_CUDA_VARIANT +// #define RUN_SYCL_VARIANT +// #define RUN_OPENMP_VARIANT #define RUN_SEQ_VARIANT -using host_pol = RAJA::seq_exec; +using host_pol = RAJA::seq_exec; using host_resources = RAJA::resources::Host; #if defined(RAJA_ENABLE_HIP) && defined(RUN_HIP_VARIANT) -using device_pol = RAJA::hip_exec<256>; +using device_pol = RAJA::hip_exec<256>; using device_resources = RAJA::resource::Hip; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::HipKernelFixed<256, - RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::hip_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_CUDA) && defined(RUN_CUDA_VARIANT) -using device_pol = RAJA::cuda_exec<256>; +using device_pol = RAJA::cuda_exec<256>; using device_resources = RAJA::resources::Cuda; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed<256, - RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = RAJA::KernelPolicy, + RAJA::statement::For<0, + RAJA::cuda_global_size_x_direct<16>, + RAJA::statement::Lambda<0>>>>>; #endif #if defined(RAJA_ENABLE_SYCL) && defined(RUN_SYCL_VARIANT) -using device_pol = RAJA::sycl_exec<256>; +using device_pol = RAJA::sycl_exec<256>; using device_resources = RAJA::resources::Sycl; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::SyclKernel< - RAJA::statement::For<1, RAJA::sycl_global_item_1, - RAJA::statement::For<0, RAJA::sycl_global_item_2, - RAJA::statement::Lambda<0> - > - > - > - >; +using kernel_pol = + RAJA::KernelPolicy>>>>; #endif #if defined(RAJA_ENABLE_OPENMP) 
&& defined(RUN_OPENMP_VARIANT) -using device_pol = RAJA::omp_parallel_for_exec; +using device_pol = RAJA::omp_parallel_for_exec; using device_resources = RAJA::resources::Host; -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using kernel_pol = RAJA::KernelPolicy>>>; #endif #if defined(RUN_SEQ_VARIANT) -using device_pol = RAJA::seq_exec; -using device_resources = RAJA::resources::Host; - -using kernel_pol = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::Lambda<0> - > - > - >; +using device_pol = RAJA::seq_exec; +using device_resources = RAJA::resources::Host; + +using kernel_pol = RAJA::KernelPolicy>>>; #endif -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { const int N = 10000; const int K = 17; - device_resources def_device_res{device_resources::get_default()}; - host_resources def_host_res{host_resources::get_default()}; + device_resources def_device_res {device_resources::get_default()}; + host_resources def_host_res {host_resources::get_default()}; auto timer = RAJA::Timer(); - //launch to intialize the stream - RAJA::forall - (RAJA::RangeSegment(0,1), [=] RAJA_HOST_DEVICE (int i) { - }); + // launch to intialize the stream + RAJA::forall(RAJA::RangeSegment(0, 1), + [=] RAJA_HOST_DEVICE(int i) {}); - int * array = def_host_res.allocate(N * N); - int * array_copy = def_host_res.allocate(N * N); + int* array = def_host_res.allocate(N * N); + int* array_copy = def_host_res.allocate(N * N); - //big array, or image - for (int i = 0; i < N * N; ++i) { - array[i] = 1; + // big array, or image + for (int i = 0; i < N * N; ++i) + { + array[i] = 1; array_copy[i] = 1; } - //small array that acts as the blur - int * kernel = def_host_res.allocate(K * K); - for (int i = 0; i < K * K; ++i) { + // small array that acts as the blur + int* kernel = def_host_res.allocate(K * K); + for (int i = 0; i < K * K; ++i) + { kernel[i] = 2; } @@ -140,7 +130,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int DIM = 2; RAJA::View> array_view(d_array, N, N); - RAJA::View> array_view_copy(d_array_copy, N, N); + RAJA::View> array_view_copy(d_array_copy, N, + N); RAJA::View> kernel_view(d_kernel, K, K); RAJA::RangeSegment range_i(0, N); @@ -148,60 +139,68 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); - RAJA::kernel - (RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - //looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += kernel_view(m, n) * array_view(x, y); - } - } - } - - array_view(i, j) += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += kernel_view(m, n) * array_view(x, y); + } + } + } + + array_view(i, j) += sum; + }); timer.stop(); - std::cout<<"Elapsed time with RAJA view : "< - 
(RAJA::make_tuple(range_i, range_j), - [=] RAJA_HOST_DEVICE (int i, int j) { - int sum = 0; - - // looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += d_kernel[m * K + n] * d_array_copy[x * N + y]; - } - } - } + timer.reset(); + timer.start(); - d_array_copy[i * N + j] += sum; - } - ); + RAJA::kernel(RAJA::make_tuple(range_i, range_j), + [=] RAJA_HOST_DEVICE(int i, int j) + { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) + { + for (int n = 0; n < K; ++n) + { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever + // the blur is located on the image + if (x < N && y < N) + { + sum += d_kernel[m * K + n] * + d_array_copy[x * N + y]; + } + } + } + + d_array_copy[i * N + j] += sum; + }); timer.stop(); - std::cout<<"Elapsed time with NO RAJA view : "<
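Editor's aside for readers comparing the two blur kernels above: with the default RAJA::Layout, a two-dimensional View is row-major, so array_view(x, y) addresses the same element as the raw d_array[x * N + y] indexing in the second kernel. A small illustrative sketch of that correspondence follows; the function and variable names (view_vs_raw, data, nrows, ncols) are hypothetical, and the default row-major layout is assumed, as in the benchmark.

    #include "RAJA/RAJA.hpp"
    #include <cassert>

    // view(i, j) with a default 2D Layout resolves to data[i * ncols + j].
    void view_vs_raw(int* data, int nrows, int ncols)
    {
      RAJA::View<int, RAJA::Layout<2>> view(data, nrows, ncols);

      // Assumes nrows > 3 and ncols > 7 so the indices are in bounds.
      view(3, 7) = 42;                    // write through the View
      assert(data[3 * ncols + 7] == 42);  // same element via raw indexing
    }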
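The blur benchmark selects its backend entirely through the device_pol, device_resources, and kernel_pol aliases defined earlier in the file. As a hedged sketch of how such aliases are typically consumed (the exact template arguments in the patched file are not reproduced here; run_blur_once and the demo aliases are hypothetical), a policy alias is passed as the template argument of forall or kernel:

    #include "RAJA/RAJA.hpp"

    // Sequential stand-ins for the per-backend aliases defined above.
    using demo_exec_pol   = RAJA::seq_exec;
    using demo_kernel_pol = RAJA::KernelPolicy<
        RAJA::statement::For<1, RAJA::seq_exec,
            RAJA::statement::For<0, RAJA::seq_exec,
                RAJA::statement::Lambda<0>>>>;

    void run_blur_once(int* img, int n)
    {
      // forall takes the execution policy as its template argument.
      RAJA::forall<demo_exec_pol>(RAJA::RangeSegment(0, n * n),
                                  [=](int i) { img[i] += 1; });

      // kernel takes the nested KernelPolicy the same way; lambda arguments
      // follow segment order (i from segment 0, j from segment 1).
      RAJA::kernel<demo_kernel_pol>(
          RAJA::make_tuple(RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n)),
          [=](int i, int j) { img[j * n + i] += 1; });
    }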