Merge branch 'develop' into artv3/nested-loops
artv3 authored Sep 1, 2024
2 parents 6c21522 + 7c81027 commit 74099c1
Showing 15 changed files with 59 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -4,7 +4,7 @@ jobs:
build_docker:
strategy:
matrix:
target: [gcc12, gcc13, clang13, clang15, rocm5.6, rocm5.6_desul, intel2024, intel2024_debug, intel2024_sycl]
target: [gcc12_debug, gcc13, clang13, clang15, rocm5.6, rocm5.6_desul, intel2024, intel2024_debug, intel2024_sycl]
runs-on: ubuntu-latest
steps:
- run: |
19 changes: 12 additions & 7 deletions Dockerfile
@@ -17,22 +17,24 @@ COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \
make -j 6 &&\
ctest -T test --output-on-failure
ctest -T test --output-on-failure && \
make clean

FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12
ENV GTEST_COLOR=1
COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \
make -j 16 &&\
ctest -T test --output-on-failure
make -j 6 &&\
ctest -T test --output-on-failure && \
make clean

FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12_debug
ENV GTEST_COLOR=1
COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \
make -j 6 &&\
make -j 16 &&\
ctest -T test --output-on-failure

FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12_desul
@@ -41,7 +43,8 @@ COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \
make -j 6 &&\
ctest -T test --output-on-failure
ctest -T test --output-on-failure && \
make clean

FROM ghcr.io/llnl/radiuss:gcc-13-ubuntu-22.04 AS gcc13
ENV GTEST_COLOR=1
@@ -65,7 +68,8 @@ COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On .. && \
make -j 6 &&\
ctest -T test --output-on-failure
ctest -T test --output-on-failure && \
make clean

FROM ghcr.io/llnl/radiuss:clang-15-ubuntu-22.04 AS clang15
ENV GTEST_COLOR=1
@@ -81,7 +85,8 @@ COPY . /home/raja/workspace
WORKDIR /home/raja/workspace/build
RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \
make -j 6 &&\
ctest -T test --output-on-failure
ctest -T test --output-on-failure && \
make clean

## Test run failure in RAJA launch tests with new reducer interface.
## Need to figure out best way to handle that.
5 changes: 4 additions & 1 deletion README.md
@@ -94,7 +94,10 @@ The [**RAJA Performance Suite**](https://github.com/LLNL/RAJAPerf) contains
a collection of loop kernels implemented in multiple RAJA and non-RAJA
variants. We use it to monitor and assess RAJA performance on different
platforms using a variety of compilers. Many major compiler vendors use the
Suite to improve their support of abstractions like RAJA.
Suite to improve their support of abstractions like RAJA. **The RAJA
Performance Suite is also an excellent source of RAJA usage examples: you can
compare RAJA and non-RAJA variants of many different kernels across the
supported RAJA back-ends.**

The [**RAJA Proxies**](https://github.com/LLNL/RAJAProxies) repository
contains RAJA versions of several important HPC proxy applications.
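The README addition above points readers at side-by-side RAJA and non-RAJA kernels. As a flavor of what such a comparison looks like, here is a minimal sketch (not taken from the Suite; the names `a`, `b`, `c`, and `N` are hypothetical) of a C-style loop next to its RAJA sequential variant:

```cpp
#include "RAJA/RAJA.hpp"

// Minimal sketch of a "non-RAJA vs. RAJA" kernel pair; both compute c = a + b.
void vector_add(const double* a, const double* b, double* c, int N)
{
  // Non-RAJA (C-style) variant
  for (int i = 0; i < N; ++i) {
    c[i] = a[i] + b[i];
  }

  // RAJA variant: same loop body; swapping the execution policy
  // (e.g., seq_exec -> omp_parallel_for_exec) retargets the back-end.
  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=] (int i) {
      c[i] = a[i] + b[i];
    });
}
```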
10 changes: 6 additions & 4 deletions azure-pipelines.yml
@@ -4,8 +4,10 @@ jobs:
matrix:
shared:
SHARED_ARGS: '-DBUILD_SHARED_LIBS=On'
static:
SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off'
###########################
## Windows static build is run on GitHub Actions
## static:
## SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off'
pool:
vmImage: 'windows-2019'
variables:
@@ -36,8 +38,8 @@ jobs:
matrix:
gcc11:
docker_target: gcc11
gcc12_debug:
docker_target: gcc12_debug
gcc12:
docker_target: gcc12
gcc12_desul:
docker_target: gcc12_desul
clang14_debug:
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -60,7 +60,8 @@
'sphinx.ext.doctest',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax'
'sphinx.ext.mathjax',
'sphinxcontrib.jquery'
]

# Add any paths that contain templates here, relative to this directory.
10 changes: 9 additions & 1 deletion docs/sphinx/user_guide/getting_started.rst
@@ -28,7 +28,7 @@ in :ref:`configopt-label`.
To build RAJA and use its most basic features, you will need:

- C++ compiler with C++14 support
- `CMake <https://cmake.org/>`_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise.
- `CMake <https://cmake.org/>`_ version 3.23 or greater.


==================
@@ -371,3 +371,11 @@ be located in the ``<build-dir>/test`` directory.
For an overview of all the main RAJA features, see :ref:`features-label`.
A full tutorial with a variety of examples showing how to use RAJA features
can be found in :ref:`tutorial-label`.

.. important:: The `RAJA Performance Suite <https://github.com/LLNL/RAJAPerf>`_
               is an excellent source of RAJA usage examples. The Suite
               contains many numerical kernels, each implemented in a variety
               of RAJA and non-RAJA variants using OpenMP, CUDA, HIP, SYCL,
               etc. Comparing the variants of a kernel is an instructive way
               to learn how RAJA features are used and how they work.
8 changes: 8 additions & 0 deletions docs/sphinx/user_guide/tutorial.rst
@@ -449,3 +449,11 @@ Other RAJA Features and Usage Examples

tutorial/halo-exchange.rst
tutorial/matrix_multiply.rst

.. important:: The `RAJA Performance Suite <https://github.com/LLNL/RAJAPerf>`_
               is an excellent source of RAJA usage examples. The Suite
               contains many numerical kernels, each implemented in a variety
               of RAJA and non-RAJA variants using OpenMP, CUDA, HIP, SYCL,
               etc. Comparing the variants of a kernel is an instructive way
               to learn how RAJA features are used and how they work.
5 changes: 0 additions & 5 deletions exercises/dot-product.cpp
@@ -38,11 +38,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

std::cout << "\n\nExercise: vector dot product...\n";

#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
#endif

//
// Define vector length
//
7 changes: 1 addition & 6 deletions exercises/dot-product_solution.cpp
@@ -40,11 +40,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

std::cout << "\n\nExercise: vector dot product...\n";

#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
#endif

//
// Define vector length
//
@@ -113,7 +108,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
ompdot += a[i] * b[i];
});
});

dot = ompdot.get();
// _rajaomp_dotprod_end
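The hunk above relies on a RAJA reducer (`ompdot`) to accumulate the dot product safely in parallel. For context, a minimal self-contained sketch of the same reducer pattern using the sequential policies (all names here are hypothetical, not from the exercise file):

```cpp
#include "RAJA/RAJA.hpp"
#include <iostream>
#include <vector>

int main()
{
  constexpr int N = 1000;
  std::vector<double> a(N, 1.0), b(N, 2.0);
  const double* pa = a.data();
  const double* pb = b.data();

  // The reducer accumulates partial sums safely; get() returns the result.
  RAJA::ReduceSum<RAJA::seq_reduce, double> dot(0.0);

  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=] (int i) {
      dot += pa[i] * pb[i];
    });

  std::cout << "dot = " << dot.get() << std::endl;  // expect 2000
  return 0;
}
```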
5 changes: 0 additions & 5 deletions exercises/vector-addition.cpp
@@ -62,11 +62,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

std::cout << "\n\nExercise: RAJA Vector Addition...\n";

#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
#endif

//
// Define vector length
//
7 changes: 1 addition & 6 deletions exercises/vector-addition_solution.cpp
@@ -62,11 +62,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

std::cout << "\n\nExercise: RAJA Vector Addition...\n";

#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
#endif

//
// Define vector length
//
@@ -179,7 +174,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::TypedRangeSegment<int>(0, N), [=] (int i) {
c[i] = a[i] + b[i];
}
);
);
// _rajaomp_vector_add_end

checkResult(c, c_ref, N);
5 changes: 3 additions & 2 deletions include/RAJA/policy/cuda/launch.hpp
@@ -209,7 +209,8 @@ void launch_global_fcn_fixed(BODY body_in)
}

template <typename BODY, int num_threads, size_t BLOCKS_PER_SM, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
{
LaunchContext ctx;

@@ -298,7 +299,7 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
using BODY = camp::decay<BODY_IN>;

auto func = reinterpret_cast<const void*>(
&launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
&launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM, camp::decay<ReduceParams>>);

resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();

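Two things change in the hunks above: the fixed-size kernel gains a `__launch_bounds__` qualifier, and the host-side function-pointer lookup is corrected to reference the `_fixed` instantiation rather than the unbounded one. As background, a minimal sketch (hypothetical kernel, CUDA C++) of what `__launch_bounds__` expresses:

```cpp
// Sketch: __launch_bounds__(maxThreadsPerBlock, minBlocksPerSM) promises the
// compiler that the kernel is never launched with more than num_threads
// threads per block, letting it budget registers for the desired occupancy.
template <typename BODY, int num_threads, size_t blocks_per_sm>
__launch_bounds__(num_threads, blocks_per_sm) __global__
void fixed_size_kernel(BODY body)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  body(i);
}

// Launching such a kernel with more threads per block than num_threads is a
// launch error, which is why the host code must look up the matching _fixed
// instantiation, as the corrected reinterpret_cast above now does.
```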
16 changes: 6 additions & 10 deletions include/RAJA/policy/sycl/MemUtils_SYCL.hpp
@@ -64,10 +64,6 @@ extern syclInfo tl_status;

extern std::unordered_map<cl::sycl::queue, bool> g_queue_info_map;

void setQueue(camp::resources::Resource* q);

cl::sycl::queue* getQueue();

} // namespace detail

//! Allocator for pinned memory for use in basic_mempool
@@ -77,7 +73,7 @@ struct PinnedAllocator {
void* malloc(size_t nbytes)
{
void* ptr;
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
ptr = ::sycl::malloc_host(nbytes, *q);
return ptr;
}
@@ -86,7 +82,7 @@ struct PinnedAllocator {
// Will throw if ptr is not in q's context
bool free(void* ptr)
{
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
::sycl::free(ptr, *q);
return true;
}
@@ -99,7 +95,7 @@ struct DeviceAllocator {
void* malloc(size_t nbytes)
{
void* ptr;
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
ptr = ::sycl::malloc_device(nbytes, *q);
return ptr;
}
@@ -108,7 +104,7 @@ struct DeviceAllocator {
// Will throw if ptr is not in q's context
bool free(void* ptr)
{
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
::sycl::free(ptr, *q);
return true;
}
@@ -122,7 +118,7 @@ struct DeviceZeroedAllocator {
void* malloc(size_t nbytes)
{
void* ptr;
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
ptr = ::sycl::malloc_device(nbytes, *q);
q->memset(ptr, 0, nbytes);
return ptr;
@@ -132,7 +128,7 @@ struct DeviceZeroedAllocator {
// Will throw if ptr is not in q's context
bool free(void* ptr)
{
::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
::sycl::free(ptr, *q);
return true;
}
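The allocator hunks above replace the queue previously registered through `setQueue()`/`getQueue()` with the queue owned by camp's default SYCL resource. A minimal sketch of the new lookup pattern, assuming only the camp calls shown in the diff (`Sycl::get_default().get_queue()`):

```cpp
#include "camp/resource.hpp"
#include <sycl/sycl.hpp>

// Sketch: USM device allocation/free driven by camp's default SYCL queue,
// mirroring the DeviceAllocator changes above.
void* alloc_device_bytes(size_t nbytes)
{
  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
  return ::sycl::malloc_device(nbytes, *q);
}

void free_device_bytes(void* ptr)
{
  ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
  ::sycl::free(ptr, *q);  // throws if ptr was not allocated in q's context
}
```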
19 changes: 6 additions & 13 deletions include/RAJA/policy/sycl/reduce.hpp
@@ -107,12 +107,8 @@ struct Reduce_Data
Reduce_Data(T initValue, T identityValue, Offload_Info &info)
: value(initValue)
{
cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();

if(!q) {
camp::resources::Resource res = camp::resources::Sycl();
q = res.get<camp::resources::Sycl>().get_queue();
}

device = reinterpret_cast<T *>(cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
host = reinterpret_cast<T *>(cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
@@ -140,7 +136,7 @@ struct Reduce_Data
//! transfers from the host to the device -- exit() is called upon failure
RAJA_INLINE void hostToDevice(Offload_Info &info)
{
cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();

if(!q) {
camp::resources::Resource res = camp::resources::Sycl();
@@ -158,7 +154,7 @@ RAJA_INLINE void deviceToHost(Offload_Info &info)
//! transfers from the device to the host -- exit() is called upon failure
RAJA_INLINE void deviceToHost(Offload_Info &info)
{
cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();

if(!q) {
camp::resources::Resource res = camp::resources::Sycl();
@@ -169,18 +165,15 @@ RAJA_INLINE void deviceToHost(Offload_Info &info)
auto e = q->memcpy(reinterpret_cast<void *>(host),
reinterpret_cast<void *>(device),
sycl::MaxNumTeams * sizeof(T));

e.wait();
}

//! frees all data from the offload information passed
RAJA_INLINE void cleanup(Offload_Info &info)
{
cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue();
if(!q) {
camp::resources::Resource res = camp::resources::Sycl();
q = res.get<camp::resources::Sycl>().get_queue();
}
cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();

if (device) {
cl::sycl::free(reinterpret_cast<void *>(device), *q);
device = nullptr;
14 changes: 0 additions & 14 deletions src/MemUtils_SYCL.cpp
@@ -48,24 +48,10 @@ syclInfo tl_status;
#pragma omp threadprivate(tl_status)
#endif

camp::resources::Resource* app_q = NULL;

void setQueue(camp::resources::Resource* qu) {
app_q = qu;
}

//! State of raja sycl queue synchronization for sycl reducer objects
std::unordered_map<cl::sycl::queue, bool> g_queue_info_map{
{cl::sycl::queue(), true}};

cl::sycl::queue* getQueue() {
if (app_q != NULL) {
return app_q->get<camp::resources::Sycl>().get_queue();
}

return NULL;
}

} // namespace detail

} // namespace sycl
