diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in
index e155555c6a..b4ef1d29b8 100644
--- a/include/RAJA/config.hpp.in
+++ b/include/RAJA/config.hpp.in
@@ -253,6 +253,21 @@ namespace RAJA {
 
 #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
 #define RAJA_CUDA_ACTIVE
+
+// Compiler numeric identifiers
+#define RAJA_CUDA_COMPILER_NVCC -1
+#define RAJA_CUDA_COMPILER_CLANG 1
+#define RAJA_CUDA_COMPILER_NVCXX 2
+
+#if defined(__NVCOMPILER_CUDA__) // it's NVC++
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCXX
+#elif defined(__NVCC__) // it's NVCC
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCC
+#elif defined(__clang__) && defined(__CUDA__) // clang cuda
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_CLANG
+#else
+#error Unknown compiler is claiming to be a CUDA compiler
+#endif
 #endif // RAJA_ENABLE_CUDA && __CUDACC__
 
 #if defined(RAJA_ENABLE_HIP) && defined(__HIPCC__)
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 5a2db973b7..500bdf27cf 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -50,6 +50,47 @@
 #include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,      \
+                             __Mask, __Type)                                  \
+  inline __device__ int __FnName(unsigned int __mask, int __val,              \
+                                 __Type __offset, int __width = warpSize) {   \
+    return __IntIntrinsic(__mask, __val, __offset,                            \
+                          ((warpSize - __width) << 8) | (__Mask));            \
+  }                                                                           \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,  \
+                                       __Type __offset,                       \
+                                       int __width = warpSize) {              \
+    struct __Bits {                                                           \
+      int __a, __b;                                                           \
+    };                                                                        \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                          \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                        \
+    __Bits __tmp;                                                             \
+    memcpy(&__tmp, &__val, sizeof(__val));                                    \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);             \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);             \
+    long long __ret;                                                          \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                    \
+    return __ret;                                                             \
+  }                                                                           \
+  inline __device__ unsigned long long __FnName(                              \
+      unsigned int __mask, unsigned long long __val, __Type __offset,         \
+      int __width = warpSize) {                                               \
+    return static_cast<unsigned long long>(::__FnName(                        \
+        __mask, static_cast<long long>(__val), __offset, __width));           \
+  }
+
+__MAKE_SYNC_SHUFFLES(__shfl_sync_fixed, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// Only the idx and bfly shuffle variants are instantiated here; both use the
+// full 0x1f lane mask.
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync_fixed, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+#endif
+
 namespace RAJA
 {
 
@@ -243,7 +284,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int la
 template <>
 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(
     unsigned long long var, int laneMask)
 {
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+  return ::__shfl_xor_sync_fixed(0xffffffffu, var, laneMask);
+#else
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
+#endif
 }
 
 template <>
@@ -310,7 +355,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLan
 template <>
 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(
     unsigned long long var, int srcLane)
 {
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+  return ::__shfl_sync_fixed(0xffffffffu, var, srcLane);
+#else
   return ::__shfl_sync(0xffffffffu, var, srcLane);
+#endif
 }
 
 template <>
@@ -389,7 +438,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
   T temp = val;
 
   for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) {
-    T rhs = __shfl_xor_sync(0xffffffff, temp, i);
+    T rhs = shfl_xor_sync(temp, i);
     Combiner{}(temp, rhs);
   }
 
diff --git a/test/integration/plugin/CMakeLists.txt b/test/integration/plugin/CMakeLists.txt
index da41e2a0bd..a511e49cfa 100644
--- a/test/integration/plugin/CMakeLists.txt
+++ b/test/integration/plugin/CMakeLists.txt
@@ -39,13 +39,21 @@ foreach( BACKEND ${PLUGIN_BACKENDS} )
     configure_file( test-plugin-workgroup.cpp.in
                     test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp )
 
+    if(${BACKEND} STREQUAL "Hip")
+      raja_add_test( NAME test-plugin-workgroup-Known-Hip-Failure-${DISPATCHER}-${BACKEND}
+                     SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
+                             plugin_to_test.cpp )
-    raja_add_test( NAME test-plugin-workgroup-${DISPATCHER}-${BACKEND}
-                   SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
-                           plugin_to_test.cpp )
+      target_include_directories(test-plugin-workgroup-Known-Hip-Failure-${DISPATCHER}-${BACKEND}.exe
+                                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+    else()
+      raja_add_test( NAME test-plugin-workgroup-${DISPATCHER}-${BACKEND}
+                     SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
+                             plugin_to_test.cpp )
 
-    target_include_directories(test-plugin-workgroup-${DISPATCHER}-${BACKEND}.exe
-                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+      target_include_directories(test-plugin-workgroup-${DISPATCHER}-${BACKEND}.exe
+                                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+    endif()
   endforeach()
 endforeach()
diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt
index aa06e5cd28..ef26aff5a9 100644
--- a/test/unit/algorithm/CMakeLists.txt
+++ b/test/unit/algorithm/CMakeLists.txt
@@ -64,7 +64,8 @@ macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_
                  test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp )
 
   #Some tests are known to fail for Hip, mark those tests (Will not be run in Gitlab CI)
-  if(${SORT_BACKEND} STREQUAL "Hip" AND (${UTIL_SORT} STREQUAL "Heap" OR
+  if((${SORT_BACKEND} STREQUAL "Hip" OR RAJA_ENABLE_CLANG_CUDA)
+     AND (${UTIL_SORT} STREQUAL "Heap" OR
      ${UTIL_SORT} STREQUAL "Insertion" OR
      ${UTIL_SORT} STREQUAL "Intro"))
     raja_add_test( NAME test-algorithm-util-sort-Known-Hip-Failure-${UTIL_SORT}-${SORT_BACKEND}
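
Note on the reduce.hpp change above: the 64-bit overloads that __MAKE_SYNC_SHUFFLES generates split the value into two 32-bit halves, shuffle each half independently, and reassemble the result through memcpy to avoid type-punning issues. A minimal standalone sketch of the same technique, assuming CUDA 9+ (__shfl_xor_sync); the helper and kernel names here are illustrative, not part of the patch:

    #include <cstring>

    // Shuffle a 64-bit value across lanes by shuffling its two 32-bit halves.
    __device__ long long shfl_xor_ll(unsigned int mask, long long val,
                                     int lane_mask)
    {
      struct Bits { int a, b; };
      Bits tmp;
      memcpy(&tmp, &val, sizeof(val));  // split into 32-bit halves
      tmp.a = __shfl_xor_sync(mask, tmp.a, lane_mask);
      tmp.b = __shfl_xor_sync(mask, tmp.b, lane_mask);
      long long ret;
      memcpy(&ret, &tmp, sizeof(tmp));  // reassemble
      return ret;
    }

    // Each lane exchanges its 64-bit value with lane ^ 1.
    __global__ void demo(long long* out)
    {
      long long v = (static_cast<long long>(threadIdx.x) << 32) | threadIdx.x;
      out[threadIdx.x] = shfl_xor_ll(0xffffffffu, v, 1);
    }

This mirrors the memcpy-based split in the patched macro: both halves travel through the 32-bit int shuffle intrinsic, so no 64-bit shuffle support is required from the compiler.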