diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c08751ff5..560aee898f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,6 +184,7 @@ endif() if(RAJA_ENABLE_HIP) message(STATUS "HIP version: ${hip_VERSION}") + set(RAJA_HIP_WAVESIZE "64" CACHE STRING "Set the wave size for GPU architecture. E.g. MI200/MI300 this is 64.") if("${hip_VERSION}" VERSION_LESS "3.5") message(FATAL_ERROR "Trying to use HIP/ROCm version ${hip_VERSION}. RAJA requires HIP/ROCm version 3.5 or newer. ") endif() diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp index 5131010bd6..d405408e7f 100644 --- a/examples/dynamic-forall.cpp +++ b/examples/dynamic-forall.cpp @@ -77,28 +77,25 @@ int main(int argc, char *argv[]) //----------------------------------------------------------------------------// - std::cout << "\n Running C-style vector addition...\n"; - - // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { - c[i] = a[i] + b[i]; - } - // _cstyle_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// Example of dynamic policy selection for forall -//----------------------------------------------------------------------------// + std::cout << "\n Running dynamic forall vector addition and reductions...\n"; + int sum = 0; + using VAL_INT_SUM = RAJA::expt::ValOp; + + RAJA::RangeSegment range(0, N); + //policy is chosen from the list - RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { + RAJA::dynamic_forall(pol, range, + RAJA::expt::Reduce(&sum), + RAJA::expt::KernelName("RAJA dynamic forall"), + [=] RAJA_HOST_DEVICE (int i, VAL_INT_SUM &_sum) { + c[i] = a[i] + b[i]; + _sum += 1; }); // _rajaseq_vector_add_end + std::cout << "Sum = " << sum << ", expected sum: " << N << std::endl; checkResult(c, N); //printResult(c, N); @@ -126,9 +123,9 @@ void checkResult(int* res, int len) if ( res[i] != 0 ) { correct = false; } } if ( correct ) { - std::cout << "\n\t result -- PASS\n"; + std::cout << "\n\t Vector sum result -- PASS\n"; } else { - std::cout << "\n\t result -- FAIL\n"; + std::cout << "\n\t Vector sum result -- FAIL\n"; } } diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp index 62d3d6e3e7..685a572033 100644 --- a/examples/memoryManager.hpp +++ b/examples/memoryManager.hpp @@ -76,7 +76,7 @@ void deallocate(T *&ptr) hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + ptr = ::sycl::malloc_device(size, *qu); #endif return ptr; } diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 0b35017fac..ac6fd62398 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -121,7 +121,7 @@ int main(int argc, char *argv[]) RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif - RAJA::expt::dynamic_forall + RAJA::dynamic_forall (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { c[i] = a[i] + b[i]; diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp index 62d3d6e3e7..685a572033 100644 --- a/exercises/memoryManager.hpp +++ b/exercises/memoryManager.hpp @@ -76,7 +76,7 @@ void deallocate(T *&ptr) hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); #elif defined(RAJA_ENABLE_SYCL) auto qu = sycl_res->get().get_queue(); - ptr = cl::sycl::malloc_device(size, *qu); + ptr = ::sycl::malloc_device(size, *qu); #endif return ptr; } diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 380418efa1..29d97fed69 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -182,6 +182,8 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX +#cmakedefine RAJA_HIP_WAVESIZE @RAJA_HIP_WAVESIZE@ + /*! ****************************************************************************** * diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index d1aa835324..764b913cd6 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -659,104 +659,99 @@ RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T cons // - Returns a generic event proxy only if a resource is provided // avoids overhead of constructing a typed erased resource // -namespace expt +template +struct dynamic_helper { - - template - struct dynamic_helper + template + static void invoke_forall(const int pol, SEGMENT const &seg, PARAMS&&... params) { - template - static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) - { - if(IDX==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - dynamic_helper::invoke_forall(pol, seg, body); - } - - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - + if(IDX==pol){ using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; - - if(IDX==pol){ - RAJA::forall(r.get(), seg, body); - - //Return a generic event proxy from r, - //because forall returns a typed event proxy - return {r}; - } - - return dynamic_helper::invoke_forall(r, pol, seg, body); + RAJA::forall(seg, params...); + return; } + dynamic_helper::invoke_forall(pol, seg, params...); + } - }; - - template - struct dynamic_helper<0, POLICY_LIST> + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params) { - template - static void - invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) - { - if(0==pol){ - using t_pol = typename camp::at>::type; - RAJA::forall(seg, body); - return; - } - RAJA_ABORT_OR_THROW("Policy enum not supported "); - } - template - static resources::EventProxy - invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) - { - if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); - - using t_pol = typename camp::at>::type; - using resource_type = typename resources::get_resource::type; + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; - RAJA::forall(r.get(), seg, body); + if(IDX==pol){ + RAJA::forall(r.get(), seg, params...); //Return a generic event proxy from r, //because forall returns a typed event proxy return {r}; } - }; + return dynamic_helper::invoke_forall(r, pol, seg, params...); + } - template - void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body) - { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); +}; - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy enum not supported"); +template +struct dynamic_helper<0, POLICY_LIST> +{ + template + static void + invoke_forall(const int pol, SEGMENT const &seg, PARAMS&&... params) + { + if(0==pol){ + using t_pol = typename camp::at>::type; + RAJA::forall(seg, params...); + return; } - dynamic_helper::invoke_forall(pol, seg, body); + RAJA_ABORT_OR_THROW("Policy enum not supported "); } - template - resources::EventProxy - dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params) { - constexpr int N = camp::size::value; - static_assert(N > 0, "RAJA policy list must not be empty"); + if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); - if(pol > N-1) { - RAJA_ABORT_OR_THROW("Policy value out of range"); - } + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; + + RAJA::forall(r.get(), seg, params...); + + //Return a generic event proxy from r, + //because forall returns a typed event proxy + return {r}; + } + +}; - return dynamic_helper::invoke_forall(r, pol, seg, body); +template +void dynamic_forall(const int pol, SEGMENT const &seg, PARAMS&&... params) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy enum not supported"); + } + dynamic_helper::invoke_forall(pol, seg, params...); +} + +template +resources::EventProxy +dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, PARAMS&&... params) +{ + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy value out of range"); } -} // namespace expt + return dynamic_helper::invoke_forall(r, pol, seg, params...); +} } // namespace RAJA diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index f1d70aeacb..7ea7ce57ef 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -162,7 +162,7 @@ class LaunchContext void *shared_mem_ptr; #if defined(RAJA_ENABLE_SYCL) - mutable cl::sycl::nd_item<3> *itm; + mutable ::sycl::nd_item<3> *itm; #endif RAJA_HOST_DEVICE LaunchContext() diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index a9f9027675..040de50f31 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -324,8 +324,9 @@ struct DeviceConstants // values for HIP warp size and max block size. // #if defined(__HIP_PLATFORM_AMD__) -constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A -// constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X +constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 64); // MI300A +// constexpr DeviceConstants device_constants(RAJA_HIP_WAVESIZE, 1024, 128); // MI250X + #elif defined(__HIP_PLATFORM_NVIDIA__) constexpr DeviceConstants device_constants(32, 1024, 32); // V100 #endif diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp index 081a88dc23..1a8c9930dd 100644 --- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp +++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp @@ -50,7 +50,7 @@ namespace detail struct syclInfo { sycl_dim_t gridDim{0}; sycl_dim_t blockDim{0}; - cl::sycl::queue qu = cl::sycl::queue(); + ::sycl::queue qu = ::sycl::queue(); bool setup_reducers = false; #if defined(RAJA_ENABLE_OPENMP) syclInfo* thread_states = nullptr; @@ -62,7 +62,7 @@ extern syclInfo g_status; extern syclInfo tl_status; -extern std::unordered_map g_queue_info_map; +extern std::unordered_map<::sycl::queue, bool> g_queue_info_map; } // namespace detail diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index 901cc694f0..1c6876e328 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -206,8 +206,8 @@ resources::EventProxy forall_impl(resources::Sycl &sycl_res, }).wait(); // Need to wait for completion to free memory // Free our device memory - cl::sycl::free(lbody, *q); - cl::sycl::free(beg, *q); + ::sycl::free(lbody, *q); + ::sycl::free(beg, *q); RAJA_FT_END; } diff --git a/include/RAJA/policy/sycl/kernel/Conditional.hpp b/include/RAJA/policy/sycl/kernel/Conditional.hpp index 9149418518..e2e6b09e6d 100644 --- a/include/RAJA/policy/sycl/kernel/Conditional.hpp +++ b/include/RAJA/policy/sycl/kernel/Conditional.hpp @@ -51,7 +51,7 @@ struct SyclStatementExecutor item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { if (Conditional::eval(data)) { diff --git a/include/RAJA/policy/sycl/kernel/For.hpp b/include/RAJA/policy/sycl/kernel/For.hpp index d0976b931f..2019bfa7a9 100644 --- a/include/RAJA/policy/sycl/kernel/For.hpp +++ b/include/RAJA/policy/sycl/kernel/For.hpp @@ -59,7 +59,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { auto len = segment_length(data); auto i = item.get_global_id(Dim); @@ -124,7 +124,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { auto len = segment_length(data); auto i = item.get_group(Dim); @@ -187,7 +187,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { auto len = segment_length(data); auto i0 = item.get_group(Dim); @@ -253,7 +253,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { auto len = segment_length(data); auto i = item.get_local_id(Dim); @@ -317,7 +317,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { auto len = segment_length(data); auto i0 = item.get_local_id(Dim); @@ -393,7 +393,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item) { auto len = segment_length(data); auto i = item.get_global_id(0); @@ -454,7 +454,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { using idx_type = camp::decay(data.offset_tuple))>; diff --git a/include/RAJA/policy/sycl/kernel/ForICount.hpp b/include/RAJA/policy/sycl/kernel/ForICount.hpp index 9c25bb0ab9..fcd3b1824d 100644 --- a/include/RAJA/policy/sycl/kernel/ForICount.hpp +++ b/include/RAJA/policy/sycl/kernel/ForICount.hpp @@ -63,7 +63,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { diff_t len = segment_length(data); auto i = item.get_local_id(ThreadDim); @@ -121,7 +121,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { diff_t len = segment_length(data); auto i0 = item.get_local_id(0); @@ -181,7 +181,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // masked size strided loop diff_t len = segment_length(data); @@ -243,7 +243,7 @@ struct SyclStatementExecutor< using typename Base::diff_t; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // block stride loop diff_t len = segment_length(data); @@ -300,7 +300,7 @@ struct SyclStatementExecutor< using typename Base::diff_t; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // grid stride loop diff_t len = segment_length(data); @@ -349,7 +349,7 @@ struct SyclStatementExecutor< using typename Base::diff_t; static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // grid stride loop diff_t len = segment_length(data); @@ -399,7 +399,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { diff_t len = segment_length(data); diff --git a/include/RAJA/policy/sycl/kernel/Lambda.hpp b/include/RAJA/policy/sycl/kernel/Lambda.hpp index 0542f4b81e..05f4fb3a44 100644 --- a/include/RAJA/policy/sycl/kernel/Lambda.hpp +++ b/include/RAJA/policy/sycl/kernel/Lambda.hpp @@ -46,7 +46,7 @@ template , Types> { static - inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Only execute the lambda if it hasn't been masked off if(thread_active){ diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp index 88c789c062..f339bccef5 100644 --- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp +++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp @@ -93,7 +93,7 @@ namespace internal * SYCL global function for launching SyclKernel policies. */ template -void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item) +void SyclKernelLauncher(Data data, ::sycl::nd_item<3> item) { using data_t = camp::decay; @@ -128,7 +128,7 @@ struct SyclLaunchHelper,StmtList,Data,Types> static void launch(Data &&data, internal::LaunchDims launch_dims, size_t shmem, - cl::sycl::queue* qu) + ::sycl::queue* qu) { // @@ -136,20 +136,20 @@ struct SyclLaunchHelper,StmtList,Data,Types> // Kernel body is nontrivially copyable, create space on device and copy to // Workaround until "is_device_copyable" is supported // - data_t* m_data = (data_t*) cl::sycl::malloc_device(sizeof(data_t), *qu); + data_t* m_data = (data_t*) ::sycl::malloc_device(sizeof(data_t), *qu); qu->memcpy(m_data, &data, sizeof(data_t)).wait(); - qu->submit([&](cl::sycl::handler& h) { + qu->submit([&](::sycl::handler& h) { h.parallel_for(launch_dims.fit_nd_range(qu), - [=] (cl::sycl::nd_item<3> item) { + [=] (::sycl::nd_item<3> item) { SyclKernelLauncher(*m_data, item); }); }).wait(); // Need to wait to free memory - cl::sycl::free(m_data, *qu); + ::sycl::free(m_data, *qu); } }; @@ -172,13 +172,13 @@ struct SyclLaunchHelper,StmtList,Data,Types> static void launch(Data &&data, internal::LaunchDims launch_dims, size_t shmem, - cl::sycl::queue* qu) + ::sycl::queue* qu) { - qu->submit([&](cl::sycl::handler& h) { + qu->submit([&](::sycl::handler& h) { h.parallel_for(launch_dims.fit_nd_range(qu), - [=] (cl::sycl::nd_item<3> item) { + [=] (::sycl::nd_item<3> item) { SyclKernelLauncher(data, item); diff --git a/include/RAJA/policy/sycl/kernel/Tile.hpp b/include/RAJA/policy/sycl/kernel/Tile.hpp index 81a57cdecb..59590b2556 100644 --- a/include/RAJA/policy/sycl/kernel/Tile.hpp +++ b/include/RAJA/policy/sycl/kernel/Tile.hpp @@ -61,7 +61,7 @@ struct SyclStatementExecutor< using enclosed_stmts_t = SyclStatementListExecutor; using diff_t = segment_diff_type; - static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){ + static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active){ // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -139,7 +139,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; - static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -231,7 +231,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; - static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -321,7 +321,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; - static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -409,7 +409,7 @@ struct SyclStatementExecutor< using diff_t = segment_diff_type; - static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + static inline RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); diff --git a/include/RAJA/policy/sycl/kernel/TileTCount.hpp b/include/RAJA/policy/sycl/kernel/TileTCount.hpp index b1d263a263..f27dafca80 100644 --- a/include/RAJA/policy/sycl/kernel/TileTCount.hpp +++ b/include/RAJA/policy/sycl/kernel/TileTCount.hpp @@ -70,7 +70,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){ + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active){ // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -141,7 +141,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -214,7 +214,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -289,7 +289,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); @@ -363,7 +363,7 @@ struct SyclStatementExecutor< static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Get the segment referenced by this Tile statement auto &segment = camp::get(data.segment_tuple); diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp index 56e3a9aa1e..3498550c25 100644 --- a/include/RAJA/policy/sycl/kernel/internal.hpp +++ b/include/RAJA/policy/sycl/kernel/internal.hpp @@ -86,7 +86,7 @@ struct LaunchDims { return result; } - cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) { + ::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) { sycl_dim_3_t launch_global; @@ -95,9 +95,9 @@ struct LaunchDims { launch_local.y = std::max(launch_local.y, local.y); launch_local.z = std::max(launch_local.z, local.z); - cl::sycl::device dev = q->get_device(); + ::sycl::device dev = q->get_device(); - auto max_work_group_size = dev.get_info< ::cl::sycl::info::device::max_work_group_size>(); + auto max_work_group_size = dev.get_info< ::sycl::info::device::max_work_group_size>(); if(launch_local.x > max_work_group_size) { launch_local.x = max_work_group_size; @@ -160,10 +160,10 @@ struct LaunchDims { launch_global.z = ((launch_global.z / launch_local.z) + 1) * launch_local.z; } - cl::sycl::range<3> ret_th = {launch_local.x, launch_local.y, launch_local.z}; - cl::sycl::range<3> ret_gl = {launch_global.x, launch_global.y, launch_global.z}; + ::sycl::range<3> ret_th = {launch_local.x, launch_local.y, launch_local.z}; + ::sycl::range<3> ret_gl = {launch_global.x, launch_global.y, launch_global.z}; - return cl::sycl::nd_range<3>(ret_gl, ret_th); + return ::sycl::nd_range<3>(ret_gl, ret_th); } }; @@ -176,7 +176,7 @@ struct SyclStatementListExecutorHelper { using cur_stmt_t = camp::at_v; template - inline static RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + inline static RAJA_DEVICE void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Execute stmt cur_stmt_t::exec(data, item, thread_active); @@ -203,7 +203,7 @@ template struct SyclStatementListExecutorHelper { template - inline static RAJA_DEVICE void exec(Data &, cl::sycl::nd_item<3> item, bool) + inline static RAJA_DEVICE void exec(Data &, ::sycl::nd_item<3> item, bool) { // nop terminator } @@ -233,7 +233,7 @@ struct SyclStatementListExecutor, Types> { static inline RAJA_DEVICE - void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active) + void exec(Data &data, ::sycl::nd_item<3> item, bool thread_active) { // Execute statements in order with helper class SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, item, thread_active); diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index ad9fecc222..c8bc7aab53 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -63,13 +63,13 @@ struct LaunchExecute> { RAJA_FT_BEGIN; - q->submit([&](cl::sycl::handler& h) { + q->submit([&](::sycl::handler& h) { auto s_vec = ::sycl::local_accessor (params.shared_mem_size, h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), - [=] (cl::sycl::nd_item<3> itm) { + (::sycl::nd_range<3>(gridSize, blockSize), + [=] (::sycl::nd_item<3> itm) { LaunchContext ctx; ctx.itm = &itm; @@ -136,14 +136,14 @@ struct LaunchExecute> { RAJA::expt::ParamMultiplexer::init(*res); auto reduction = ::sycl::reduction(res, launch_reducers, combiner); - q->submit([&](cl::sycl::handler& h) { + q->submit([&](::sycl::handler& h) { auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (::sycl::nd_range<3>(gridSize, blockSize), reduction, - [=] (cl::sycl::nd_item<3> itm, auto & red) { + [=] (::sycl::nd_item<3> itm, auto & red) { LaunchContext ctx; ctx.itm = &itm; @@ -211,16 +211,16 @@ struct LaunchExecute> { // using LOOP_BODY = camp::decay; LOOP_BODY* lbody; - lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q); + lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q); q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); - q->submit([&](cl::sycl::handler& h) { + q->submit([&](::sycl::handler& h) { auto s_vec = ::sycl::local_accessor (params.shared_mem_size, h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), - [=] (cl::sycl::nd_item<3> itm) { + (::sycl::nd_range<3>(gridSize, blockSize), + [=] (::sycl::nd_item<3> itm) { LaunchContext ctx; ctx.itm = &itm; @@ -234,7 +234,7 @@ struct LaunchExecute> { }).wait(); // Need to wait for completion to free memory - cl::sycl::free(lbody, *q); + ::sycl::free(lbody, *q); RAJA_FT_END; @@ -290,21 +290,21 @@ struct LaunchExecute> { // using LOOP_BODY = camp::decay; LOOP_BODY* lbody; - lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q); + lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q); q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); ReduceParams* res = ::sycl::malloc_shared(1,*q); RAJA::expt::ParamMultiplexer::init(*res); auto reduction = ::sycl::reduction(res, launch_reducers, combiner); - q->submit([&](cl::sycl::handler& h) { + q->submit([&](::sycl::handler& h) { auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (::sycl::nd_range<3>(gridSize, blockSize), reduction, - [=] (cl::sycl::nd_item<3> itm, auto & red) { + [=] (::sycl::nd_item<3> itm, auto & red) { LaunchContext ctx; ctx.itm = &itm; @@ -325,7 +325,7 @@ struct LaunchExecute> { RAJA::expt::ParamMultiplexer::combine( launch_reducers, *res ); ::sycl::free(res, *q); - cl::sycl::free(lbody, *q); + ::sycl::free(lbody, *q); RAJA_FT_END; } diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp index 0f92fe27e1..a02c9ea30f 100644 --- a/include/RAJA/policy/sycl/policy.hpp +++ b/include/RAJA/policy/sycl/policy.hpp @@ -39,7 +39,7 @@ struct uint3 { unsigned long x, y, z; }; -using sycl_dim_t = cl::sycl::range<1>; +using sycl_dim_t = ::sycl::range<1>; using sycl_dim_3_t = uint3; diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp index 49d89b3cd2..8a84d5b412 100644 --- a/include/RAJA/policy/sycl/reduce.hpp +++ b/include/RAJA/policy/sycl/reduce.hpp @@ -107,11 +107,11 @@ struct Reduce_Data Reduce_Data(T initValue, T identityValue, Offload_Info &info) : value(initValue) { - cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); + ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); - device = reinterpret_cast(cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q))); - host = reinterpret_cast(cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q))); + device = reinterpret_cast(::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q))); + host = reinterpret_cast(::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q))); if (!host) { printf("Unable to allocate space on host\n"); @@ -136,7 +136,7 @@ struct Reduce_Data //! transfers from the host to the device -- exit() is called upon failure RAJA_INLINE void hostToDevice(Offload_Info &info) { - cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); + ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); if(!q) { camp::resources::Resource res = camp::resources::Sycl(); @@ -154,7 +154,7 @@ struct Reduce_Data //! transfers from the device to the host -- exit() is called upon failure RAJA_INLINE void deviceToHost(Offload_Info &info) { - cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); + ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); if(!q) { camp::resources::Resource res = camp::resources::Sycl(); @@ -172,14 +172,14 @@ struct Reduce_Data //! frees all data from the offload information passed RAJA_INLINE void cleanup(Offload_Info &info) { - cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); + ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue(); if (device) { - cl::sycl::free(reinterpret_cast(device), *q); + ::sycl::free(reinterpret_cast(device), *q); device = nullptr; } if (host) { - cl::sycl::free(reinterpret_cast(host), *q); + ::sycl::free(reinterpret_cast(host), *q); //delete[] host; host = nullptr; } @@ -243,8 +243,8 @@ struct TargetReduce TargetReduce &reduce(T rhsVal) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(val.device[i]); + auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(val.device[i]); Reducer{}(atm, rhsVal); return *this; #else @@ -257,8 +257,8 @@ struct TargetReduce const TargetReduce &reduce(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(val.device[i]); + auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(val.device[i]); Reducer{}(atm, rhsVal); return *this; #else @@ -356,10 +356,10 @@ struct TargetReduceLoc TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - cl::sycl::atomic_fence(cl::sycl::memory_order_acquire, cl::sycl::memory_scope::device); + auto i = 0; //__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + ::sycl::atomic_fence(::sycl::memory_order_acquire, ::sycl::memory_scope::device); Reducer{}(val.device[i], loc.device[i], rhsVal, rhsLoc); - cl::sycl::atomic_fence(cl::sycl::memory_order_release, cl::sycl::memory_scope::device); + ::sycl::atomic_fence(::sycl::memory_order_release, ::sycl::memory_scope::device); return *this; #else Reducer{}(val.value, loc.value, rhsVal, rhsLoc); @@ -415,8 +415,8 @@ class ReduceSum const self &operator+=(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm.fetch_add(rhsVal); return *this; #else @@ -441,8 +441,8 @@ class ReduceBitOr self &operator|=(T rhsVal) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm |= rhsVal; return *this; #else @@ -455,8 +455,8 @@ class ReduceBitOr const self &operator|=(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm |= rhsVal; return *this; #else @@ -481,8 +481,8 @@ class ReduceBitAnd self &operator&=(T rhsVal) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm &= rhsVal; return *this; #else @@ -495,8 +495,8 @@ class ReduceBitAnd const self &operator&=(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm &= rhsVal; return *this; #else @@ -522,8 +522,8 @@ class ReduceMin self &min(T rhsVal) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm.fetch_min(rhsVal); return *this; #else @@ -536,8 +536,8 @@ class ReduceMin const self &min(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm.fetch_min(rhsVal); return *this; #else @@ -563,8 +563,8 @@ class ReduceMax self &max(T rhsVal) { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm.fetch_max(rhsVal); return *this; #else @@ -577,8 +577,8 @@ class ReduceMax const self &max(T rhsVal) const { #ifdef __SYCL_DEVICE_ONLY__ - auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0]; - auto atm = ::sycl::atomic_ref(parent::val.device[i]); + auto i = 0;//__spirv::initLocalInvocationId<1, ::sycl::id<1>>()[0]; + auto atm = ::sycl::atomic_ref(parent::val.device[i]); atm.fetch_max(rhsVal); return *this; #else diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp index 74bbc2f077..f1810807f9 100644 --- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp +++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp @@ -57,7 +57,7 @@ namespace expt public: - static constexpr int s_num_elem = 64; + static constexpr int s_num_elem = policy::hip::device_constants.WARP_SIZE; /*! * @brief Default constructor, zeros register contents @@ -780,8 +780,8 @@ namespace expt // Third: mask off everything but output_segment // this is because all output segments are valid at this point - // (5-segbits), the 5 is since the warp-width is 32 == 1<<5 - int our_output_segment = get_lane()>>(6-segbits); + static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); + int our_output_segment = get_lane()>>(log2_warp_size-segbits); bool in_output_segment = our_output_segment == output_segment; if(!in_output_segment){ result.get_raw_value() = 0; @@ -828,8 +828,9 @@ namespace expt // First: tree reduce values within each segment element_type x = m_value; + static constexpr int log2_warp_size = RAJA::log2(RAJA::policy::hip::device_constants.WARP_SIZE); RAJA_UNROLL - for(int i = 0;i < 6-segbits; ++ i){ + for(int i = 0;i < log2_warp_size-segbits; ++ i){ // tree shuffle int delta = s_num_elem >> (i+1); diff --git a/include/RAJA/policy/tensor/arch/hip/traits.hpp b/include/RAJA/policy/tensor/arch/hip/traits.hpp index 4c4d959599..1b8a9679bb 100644 --- a/include/RAJA/policy/tensor/arch/hip/traits.hpp +++ b/include/RAJA/policy/tensor/arch/hip/traits.hpp @@ -29,7 +29,8 @@ namespace expt { struct RegisterTraits{ using element_type = T; using register_policy = RAJA::expt::hip_wave_register; - static constexpr camp::idx_t s_num_elem = 64; + static constexpr camp::idx_t s_num_elem = policy::hip::device_constants.WARP_SIZE; + static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem; using int_element_type = int32_t; }; diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 815928e434..6be479a535 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -13,7 +13,7 @@ if [[ $# -lt 1 ]]; then echo " 1) SYCL compiler installation path" echo echo "For example: " - echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1" + echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_730cd3a5275f_hip_gcc10.3.1_rocm6.0.2" exit fi diff --git a/src/MemUtils_SYCL.cpp b/src/MemUtils_SYCL.cpp index 0b5f1b8be6..3a3976c675 100644 --- a/src/MemUtils_SYCL.cpp +++ b/src/MemUtils_SYCL.cpp @@ -49,8 +49,8 @@ syclInfo tl_status; #endif //! State of raja sycl queue synchronization for sycl reducer objects -std::unordered_map g_queue_info_map{ - {cl::sycl::queue(), true}}; +std::unordered_map<::sycl::queue, bool> g_queue_info_map{ + {::sycl::queue(), true}}; } // namespace detail diff --git a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp index 8c8d051d8f..dde4273abe 100644 --- a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp +++ b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp @@ -34,7 +34,7 @@ void DynamicForallResourceRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin); - RAJA::expt::dynamic_forall(working_res, pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { + RAJA::dynamic_forall(working_res, pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; }); diff --git a/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp b/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp index 11168b0e30..6f13f07cf5 100644 --- a/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp +++ b/test/functional/dynamic_forall/segment/tests/test-dynamic-forall-RangeSegment.hpp @@ -40,7 +40,7 @@ void DynamicForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, const std::iota(test_array, test_array + RAJA::stripIndexType(N), rbegin); - RAJA::expt::dynamic_forall(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { + RAJA::dynamic_forall(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { working_array[RAJA::stripIndexType(idx - rbegin)] = idx; }); @@ -50,7 +50,7 @@ void DynamicForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last, const working_res.memcpy(working_array, test_array, sizeof(INDEX_TYPE) * data_len); - RAJA::expt::dynamic_forall(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { + RAJA::dynamic_forall(pol, r1, [=] RAJA_HOST_DEVICE(INDEX_TYPE idx) { (void) idx; working_array[0]++; }); diff --git a/test/functional/kernel/basic-single-loop/test-kernel-basic-single-loop.cpp.in b/test/functional/kernel/basic-single-loop/test-kernel-basic-single-loop.cpp.in index 9b54015cda..76c8371df9 100644 --- a/test/functional/kernel/basic-single-loop/test-kernel-basic-single-loop.cpp.in +++ b/test/functional/kernel/basic-single-loop/test-kernel-basic-single-loop.cpp.in @@ -80,11 +80,11 @@ using OpenMPKernelExecPols = camp::list< using OpenMPTargetKernelExecPols = camp::list< - RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::omp_target_parallel_for_exec<4>, - RAJA::statement::Lambda<0, RAJA::Segs<0>> - > - >, +//RAJA::KernelPolicy< +// RAJA::statement::For<0, RAJA::omp_target_parallel_for_exec<4>, +// RAJA::statement::Lambda<0, RAJA::Segs<0>> +// > +//>, RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::omp_target_parallel_for_exec_nt, diff --git a/test/include/RAJA_test-tensor.hpp b/test/include/RAJA_test-tensor.hpp index cf633098a9..d836e1463f 100644 --- a/test/include/RAJA_test-tensor.hpp +++ b/test/include/RAJA_test-tensor.hpp @@ -87,7 +87,9 @@ struct TensorTestHelper void exec(BODY const &body){ hipDeviceSynchronize(); - RAJA::forall>(RAJA::RangeSegment(0,64), + static constexpr int warp_size = RAJA::policy::hip::device_constants.WARP_SIZE; + + RAJA::forall>(RAJA::RangeSegment(0,warp_size), [=] RAJA_HOST_DEVICE (int ){ body(); }); diff --git a/tpl/camp b/tpl/camp index 0f07de4240..7866dc34a4 160000 --- a/tpl/camp +++ b/tpl/camp @@ -1 +1 @@ -Subproject commit 0f07de4240c42e0b38a8d872a20440cb4b33d9f5 +Subproject commit 7866dc34a4f86b6c58891f6daf9f6da0f89b3d94