Skip to content

Commit

Permalink
fifo should be correct now as well.
Browse files Browse the repository at this point in the history
  • Loading branch information
eschouks committed Oct 6, 2023
1 parent 0217f35 commit 939c97a
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 10 deletions.
33 changes: 26 additions & 7 deletions include/mscclpp/fifo_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#ifndef MSCCLPP_FIFO_DEVICE_HPP_
#define MSCCLPP_FIFO_DEVICE_HPP_

#include <cuda/atomic>

#include "poll.hpp"

namespace mscclpp {
Expand Down Expand Up @@ -38,23 +40,36 @@ struct FifoDeviceHandle {
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
/// @return The new head of the FIFO.
__forceinline__ __device__ uint64_t push(ProxyTrigger trigger, int64_t maxSpinCount = 1000000) {
uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->head, 1);
uint64_t curFifoHead =
cuda::atomic_ref<uint64_t, cuda::thread_scope_device>{*this->head}.fetch_add(1, cuda::memory_order_relaxed);

// make the last bit intentionally non-zero so that we can safely poll. Don't worry, we will change it back in host
// side
trigger.snd ^= ((uint64_t)1 << (uint64_t)63);

// Only one of two conditions need to be met to proceed. Either the tail has advanced enough or where we need to
// write to is 0. However, the first condition is faster to check since the tail is flushed periodically anyways but
// for the second condition we need to read CPU memory.
// As volatile access is slow, we first check using the bare pointer and then use the volatile pointer if the
// As atomic access is slow, we first check using the bare pointer and then use the atomic load if the
// condition is not met.
if (curFifoHead >= size + *(this->tailReplica)) {
OR_POLL_MAYBE_JAILBREAK(curFifoHead >= size + *((volatile uint64_t*)this->tailReplica),
*(volatile uint64_t*)&this->triggers[curFifoHead % size] != 0, maxSpinCount);
OR_POLL_MAYBE_JAILBREAK(
(curFifoHead >= size + cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*this->tailReplica}.load(
cuda::memory_order_relaxed)),
(cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{this->triggers[curFifoHead % size].fst}.load(
cuda::memory_order_relaxed) != 0),
maxSpinCount);
}

ProxyTrigger* triggerPtr = (ProxyTrigger*)&(this->triggers[curFifoHead % size]);
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));

// only the fst needs to atomically stored thanks to memory_order_release. On the host side we only need to
// atomically load the fst. If fst is non-zero, snd is guaranteed to be the right value
triggerPtr->snd = trigger.snd;
cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{triggerPtr->fst}.store(trigger.fst,
cuda::memory_order_release);

// asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
return curFifoHead;
}

Expand All @@ -65,8 +80,12 @@ struct FifoDeviceHandle {
__forceinline__ __device__ void sync(uint64_t curFifoHead, int64_t maxSpinCount = 1000000) {
// Same as push but in this case checking the fist condition is probably faster since for tail to be pushed we need
// to wait for cudaMemcpy to be done.
OR_POLL_MAYBE_JAILBREAK(*(volatile uint64_t*)&(this->triggers[curFifoHead % size]) != 0,
*(volatile uint64_t*)(this->tailReplica) <= curFifoHead, maxSpinCount);
OR_POLL_MAYBE_JAILBREAK(
(curFifoHead >=
cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*this->tailReplica}.load(cuda::memory_order_relaxed)),
(cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{this->triggers[curFifoHead % size].fst}.load(
cuda::memory_order_relaxed) != 0),
maxSpinCount);
}
#endif // __CUDACC__

Expand Down
7 changes: 4 additions & 3 deletions src/fifo.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <cuda/atomic>
#include <mscclpp/cuda_utils.hpp>
#include <mscclpp/fifo.hpp>

Expand Down Expand Up @@ -40,9 +41,9 @@ MSCCLPP_API_CPP Fifo::~Fifo() = default;

MSCCLPP_API_CPP ProxyTrigger Fifo::poll() {
ProxyTrigger trigger;
volatile ProxyTrigger* ptr =
reinterpret_cast<volatile ProxyTrigger*>(&pimpl->triggers.get()[pimpl->hostTail % pimpl->size]);
trigger.fst = ptr->fst;
ProxyTrigger* ptr = &pimpl->triggers.get()[pimpl->hostTail % pimpl->size];
// we are loading fst first. if fst is non-zero then snd is also valid
trigger.fst = cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{ptr->fst}.load(cuda::memory_order_acquire);
trigger.snd = ptr->snd;
return trigger;
}
Expand Down

0 comments on commit 939c97a

Please sign in to comment.