Deliberately leak PTDS thread_local events in stream ordered mr (#1375)
An object with the `thread_local` modifier has thread storage duration, so its destructor (if it has one) runs after the thread exits, which, on the main thread, is after `main` returns (https://eel.is/c++draft/basic.start.term). The CUDA runtime registers (on the first call into the runtime) an `atexit` handler that tears down the driver. Although [basic.start.term#5](https://eel.is/c++draft/basic.start.term#5) guarantees the order in which these destructors are called (objects with thread storage duration are destroyed _before_ any `atexit` handlers run), it appears that GNU libstdc++ does not always implement this correctly (when not compiled with `_GLIBCXX_HAVE___CXA_THREAD_ATEXIT`).

Moreover (and possibly as a consequence), it is considered undefined behaviour to call into the CUDA runtime after `main` returns. Hence we cannot call `cudaEventDestroy` to deallocate our `thread_local` events. Since there are only finitely many of these events (`ndevices * nparticipating_threads`), rather than attempting to destroy them we deliberately leak them, thus avoiding any sequencing problems.

- Closes #1371

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #1375
wence- authored Nov 8, 2023
1 parent 28abb10 commit d407fd3
50 changes: 14 additions & 36 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -26,12 +26,8 @@
#include <fmt/core.h>

#include <cstddef>
#include <functional>
#include <limits>
#include <map>
#include <mutex>
#include <set>
#include <thread>
#include <unordered_map>

namespace rmm::mr::detail {
@@ -259,23 +255,6 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
}

private:
/**
* @brief RAII wrapper for a CUDA event.
*/
struct event_wrapper {
event_wrapper()
{
RMM_ASSERT_CUDA_SUCCESS(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
}
~event_wrapper() { RMM_ASSERT_CUDA_SUCCESS(cudaEventDestroy(event)); }
cudaEvent_t event{};

event_wrapper(event_wrapper const&) = delete;
event_wrapper& operator=(event_wrapper const&) = delete;
event_wrapper(event_wrapper&&) noexcept = delete;
event_wrapper& operator=(event_wrapper&&) = delete;
};

/**
* @brief get a unique CUDA event (possibly new) associated with `stream`
*
@@ -289,17 +268,20 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
stream_event_pair get_event(cuda_stream_view stream)
{
if (stream.is_per_thread_default()) {
// Create a thread-local shared event wrapper for each device. Shared pointers in the thread
// and in each MR instance ensure the wrappers are destroyed only after all are finished
// with them.
thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
rmm::get_num_cuda_devices());
auto event = [&, device_id = this->device_id_]() {
if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }

auto event = std::make_shared<event_wrapper>();
this->default_stream_events.insert(event);
return (events_tls[device_id.value()] = std::move(event))->event;
// Create a thread-local event for each device. These events are
// deliberately leaked since the destructor needs to call into
// the CUDA runtime and thread_local destructors (can) run below
// main: it is undefined behaviour to call into the CUDA
// runtime below main.
thread_local std::vector<cudaEvent_t> events_tls(rmm::get_num_cuda_devices());
auto event = [device_id = this->device_id_]() {
auto& e = events_tls[device_id.value()];
if (!e) {
// These events are deliberately not destructed and therefore live until
// program exit.
RMM_ASSERT_CUDA_SUCCESS(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
}
return e;
}();
return stream_event_pair{stream.value(), event};
}
@@ -505,10 +487,6 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
// bidirectional mapping between non-default streams and events
std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;

// shared pointers to events keeps the events alive as long as either the thread that created
// them or the MR that is using them exists.
std::set<std::shared_ptr<event_wrapper>> default_stream_events;

std::mutex mtx_; // mutex for thread-safe access

rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
