diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index f071717c0..a57bf1c6d 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -26,12 +26,8 @@ #include #include -#include -#include #include #include -#include -#include #include namespace rmm::mr::detail { @@ -259,23 +255,6 @@ class stream_ordered_memory_resource : public crtp, public device_ } private: - /** - * @brief RAII wrapper for a CUDA event. - */ - struct event_wrapper { - event_wrapper() - { - RMM_ASSERT_CUDA_SUCCESS(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - } - ~event_wrapper() { RMM_ASSERT_CUDA_SUCCESS(cudaEventDestroy(event)); } - cudaEvent_t event{}; - - event_wrapper(event_wrapper const&) = delete; - event_wrapper& operator=(event_wrapper const&) = delete; - event_wrapper(event_wrapper&&) noexcept = delete; - event_wrapper& operator=(event_wrapper&&) = delete; - }; - /** * @brief get a unique CUDA event (possibly new) associated with `stream` * @@ -289,17 +268,20 @@ class stream_ordered_memory_resource : public crtp, public device_ stream_event_pair get_event(cuda_stream_view stream) { if (stream.is_per_thread_default()) { - // Create a thread-local shared event wrapper for each device. Shared pointers in the thread - // and in each MR instance ensure the wrappers are destroyed only after all are finished - // with them. - thread_local std::vector> events_tls( - rmm::get_num_cuda_devices()); - auto event = [&, device_id = this->device_id_]() { - if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; } - - auto event = std::make_shared(); - this->default_stream_events.insert(event); - return (events_tls[device_id.value()] = std::move(event))->event; + // Create a thread-local event for each device. These events are + // deliberately leaked since the destructor needs to call into + // the CUDA runtime and thread_local destructors (can) run below + // main: it is undefined behaviour to call into the CUDA + // runtime below main. + thread_local std::vector events_tls(rmm::get_num_cuda_devices()); + auto event = [device_id = this->device_id_]() { + auto& e = events_tls[device_id.value()]; + if (!e) { + // These events are deliberately not destructed and therefore live until + // program exit. + RMM_ASSERT_CUDA_SUCCESS(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + } + return e; }(); return stream_event_pair{stream.value(), event}; } @@ -505,10 +487,6 @@ class stream_ordered_memory_resource : public crtp, public device_ // bidirectional mapping between non-default streams and events std::unordered_map stream_events_; - // shared pointers to events keeps the events alive as long as either the thread that created - // them or the MR that is using them exists. - std::set> default_stream_events; - std::mutex mtx_; // mutex for thread-safe access rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};