Misc updates (#95)
chhwang authored Jun 12, 2023
1 parent 798631b commit 5a4885c
Showing 16 changed files with 102 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
@@ -21,7 +21,7 @@ steps:
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/compat/lib.real
mkdir build && cd build
/tmp/cmake-3.26.4-linux-x86_64/bin/cmake ..
/tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

2 changes: 1 addition & 1 deletion .azure-pipelines/ut.yml
@@ -24,7 +24,7 @@ jobs:
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/compat/lib.real
mkdir build && cd build
/tmp/cmake-3.26.4-linux-x86_64/bin/cmake ..
/tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
@@ -46,7 +46,7 @@ jobs:
git config --global --add safe.directory /__w/mscclpp/mscclpp
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake .
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .
make -j
- name: Perform CodeQL Analysis
6 changes: 3 additions & 3 deletions .github/workflows/lint.yml
@@ -19,14 +19,14 @@ jobs:
uses: actions/checkout@v3

- name: Install ClangFormat
run: sudo apt-get install -y clang-format-12
run: sudo apt-get install -y clang-format

- name: Run cpplint
run: |
CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*")
PYTHONCPPSOURCES=$(find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)')
clang-format-12 -style=file --verbose --Werror --dry-run ${CPPSOURCES}
clang-format-12 --dry-run ${PYTHONCPPSOURCES}
clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}
clang-format --dry-run ${PYTHONCPPSOURCES}
spelling:
runs-on: ubuntu-20.04
17 changes: 16 additions & 1 deletion CMakeLists.txt
@@ -2,7 +2,6 @@ cmake_minimum_required(VERSION 3.26)
project(mscclpp LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90")

option(ENABLE_TRACE "Enable tracing" OFF)
option(USE_MPI_FOR_TESTS "Use MPI for tests" ON)
@@ -16,6 +15,22 @@ endif()
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

find_package(CUDAToolkit REQUIRED)

# Set CUDA flags based on the detected CUDA version
if(CUDAToolkit_FOUND)
if(CUDAToolkit_VERSION_MAJOR LESS 11)
message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
endif()

if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_80,code=sm_80")
endif()

if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_90,code=sm_90")
endif()
endif()

find_package(IBVerbs REQUIRED)
find_package(NUMA REQUIRED)
if(USE_MPI_FOR_TESTS)
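The `-gencode` pairs above target specific GPU architectures: `compute_80/sm_80` corresponds to Ampere-class GPUs and `compute_90/sm_90` to Hopper-class GPUs, which this change only requests when a CUDA 12 or newer toolkit is detected. As a purely illustrative aside (not part of this commit), a stand-alone program can report which of the requested architectures applies to the local GPU:

```cpp
// Illustrative sketch, not part of the commit: print the compute capability
// of device 0 so the reported major.minor can be matched against the
// requested sm_80 / sm_90 targets.
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaError_t err = cudaGetDeviceProperties(&prop, 0);  // assumes device 0 exists
  if (err != cudaSuccess) {
    std::printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  std::printf("Compute capability: %d.%d\n", prop.major, prop.minor);  // 8.0 -> sm_80, 9.0 -> sm_90
  return 0;
}
```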
8 changes: 0 additions & 8 deletions TODO.md

This file was deleted.

48 changes: 7 additions & 41 deletions include/mscclpp/channel.hpp
@@ -157,31 +157,19 @@ struct DeviceChannel {
DeviceProxyFifo fifo_;
};

class DeviceChannelService;

inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService);

class DeviceChannelService {
public:
DeviceChannelService(Communicator& communicator);

ChannelId addChannel(std::shared_ptr<Connection> connection) {
channels_.push_back(Channel(communicator_, connection));
return channels_.size() - 1;
}
ChannelId addChannel(std::shared_ptr<Connection> connection);

MemoryId addMemory(RegisteredMemory memory) {
memories_.push_back(memory);
return memories_.size() - 1;
}
MemoryId addMemory(RegisteredMemory memory);

Channel channel(ChannelId id) { return channels_[id]; }
DeviceChannel deviceChannel(ChannelId id) {
return DeviceChannel(id, channels_[id].epoch().deviceHandle(), proxy_.fifo().deviceFifo());
}
Channel channel(ChannelId id) const;
DeviceChannel deviceChannel(ChannelId id);

void startProxy() { proxy_.start(); }
void stopProxy() { proxy_.stop(); }
void startProxy();
void stopProxy();

private:
Communicator& communicator_;
@@ -192,29 +180,7 @@ class DeviceChannelService {

void bindThread();

ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) {
ChannelTrigger* trigger = reinterpret_cast<ChannelTrigger*>(&triggerRaw);
Channel& channel = channels_[trigger->fields.chanId];

auto result = ProxyHandlerResult::Continue;

if (trigger->fields.type & TriggerData) {
RegisteredMemory& dst = memories_[trigger->fields.dstMemoryId];
RegisteredMemory& src = memories_[trigger->fields.srcMemoryId];
channel.connection().write(dst, trigger->fields.dstOffset, src, trigger->fields.srcOffset, trigger->fields.size);
}

if (trigger->fields.type & TriggerFlag) {
channel.epoch().signal();
}

if (trigger->fields.type & TriggerSync) {
channel.connection().flush();
result = ProxyHandlerResult::FlushFifoTailAndContinue;
}

return result;
}
ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw);
};

struct SimpleDeviceChannel {
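With its member-function bodies moved out to src/channel.cc (shown further below), the header now only declares the DeviceChannelService interface. As a hedged illustration of how those declarations fit together, not code from this repository, a minimal host-side setup might look like the following; the Communicator, Connection, and RegisteredMemory objects are assumed to be created and exchanged elsewhere:

```cpp
// Illustrative sketch only: wire a connection and two registered memory
// regions into a DeviceChannelService and run its proxy thread.
#include <memory>

#include <mscclpp/channel.hpp>
#include <mscclpp/core.hpp>

void runService(mscclpp::Communicator& comm, std::shared_ptr<mscclpp::Connection> conn,
                mscclpp::RegisteredMemory localMem, mscclpp::RegisteredMemory remoteMem) {
  mscclpp::channel::DeviceChannelService service(comm);

  // Register the connection and the memory regions the proxy may touch.
  mscclpp::channel::ChannelId cid = service.addChannel(conn);
  mscclpp::channel::MemoryId dstId = service.addMemory(remoteMem);
  mscclpp::channel::MemoryId srcId = service.addMemory(localMem);
  (void)dstId;
  (void)srcId;

  // Device-side handle that kernels use to post triggers into the proxy FIFO.
  mscclpp::channel::DeviceChannel devChan = service.deviceChannel(cid);
  (void)devChan;

  service.startProxy();
  // ... launch kernels that write/signal through devChan ...
  service.stopProxy();
}
```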
14 changes: 2 additions & 12 deletions include/mscclpp/core.hpp
@@ -31,18 +31,8 @@ class BaseBootstrap {
virtual void allGather(void* allData, int size) = 0;
virtual void barrier() = 0;

// TODO: move implementations of these helpers out of this header
void send(const std::vector<char>& data, int peer, int tag) {
size_t size = data.size();
send((void*)&size, sizeof(size_t), peer, tag);
send((void*)data.data(), data.size(), peer, tag + 1);
}
void recv(std::vector<char>& data, int peer, int tag) {
size_t size;
recv((void*)&size, sizeof(size_t), peer, tag);
data.resize(size);
recv((void*)data.data(), data.size(), peer, tag + 1);
}
void send(const std::vector<char>& data, int peer, int tag);
void recv(std::vector<char>& data, int peer, int tag);
};

class Bootstrap : public BaseBootstrap {
2 changes: 1 addition & 1 deletion include/mscclpp/cuda_utils.hpp
@@ -162,4 +162,4 @@ void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMe

} // namespace mscclpp

#endif // MSCCLPP_CUDA_UTILS_HPP_
#endif // MSCCLPP_CUDA_UTILS_HPP_
13 changes: 13 additions & 0 deletions src/bootstrap/bootstrap.cc
@@ -45,6 +45,19 @@ struct ExtInfo {
mscclppSocketAddress extAddressListen;
};

MSCCLPP_API_CPP void BaseBootstrap::send(const std::vector<char>& data, int peer, int tag) {
size_t size = data.size();
send((void*)&size, sizeof(size_t), peer, tag);
send((void*)data.data(), data.size(), peer, tag + 1);
}

MSCCLPP_API_CPP void BaseBootstrap::recv(std::vector<char>& data, int peer, int tag) {
size_t size;
recv((void*)&size, sizeof(size_t), peer, tag);
data.resize(size);
recv((void*)data.data(), data.size(), peer, tag + 1);
}

struct UniqueIdInternal {
uint64_t magic;
union mscclppSocketAddress addr;
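The two helpers above implement a simple length-prefixed exchange: the payload size travels on `tag` and the bytes themselves on `tag + 1`, so the receiver can resize its buffer before reading. A hedged usage sketch follows; only the vector-based send/recv overloads come from the code above, the surrounding names and setup are hypothetical:

```cpp
// Hypothetical illustration of the length-prefixed helpers; the bootstrap
// object, peer rank, and tag are assumed to be set up elsewhere.
#include <string>
#include <vector>

#include <mscclpp/core.hpp>

void exchangeGreeting(mscclpp::BaseBootstrap& bootstrap, int rank, int peer, int tag) {
  if (rank == 0) {
    std::string msg = "hello from rank 0";
    std::vector<char> buf(msg.begin(), msg.end());
    bootstrap.send(buf, peer, tag);  // sends the size on `tag`, then the bytes on `tag + 1`
  } else {
    std::vector<char> buf;
    bootstrap.recv(buf, peer, tag);  // resizes `buf` to the received size, then fills it
  }
}
```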
44 changes: 44 additions & 0 deletions src/channel.cc
@@ -16,12 +16,56 @@ MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communi
deviceNumaNode = getDeviceNumaNode(cudaDevice);
}

MSCCLPP_API_CPP ChannelId DeviceChannelService::addChannel(std::shared_ptr<Connection> connection) {
channels_.push_back(Channel(communicator_, connection));
return channels_.size() - 1;
}

MSCCLPP_API_CPP MemoryId DeviceChannelService::addMemory(RegisteredMemory memory) {
memories_.push_back(memory);
return memories_.size() - 1;
}

MSCCLPP_API_CPP Channel DeviceChannelService::channel(ChannelId id) const { return channels_[id]; }

MSCCLPP_API_CPP DeviceChannel DeviceChannelService::deviceChannel(ChannelId id) {
return DeviceChannel(id, channels_[id].epoch().deviceHandle(), proxy_.fifo().deviceFifo());
}

MSCCLPP_API_CPP void DeviceChannelService::startProxy() { proxy_.start(); }

MSCCLPP_API_CPP void DeviceChannelService::stopProxy() { proxy_.stop(); }

MSCCLPP_API_CPP void DeviceChannelService::bindThread() {
if (deviceNumaNode >= 0) {
numaBind(deviceNumaNode);
INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode);
}
}

ProxyHandlerResult DeviceChannelService::handleTrigger(ProxyTrigger triggerRaw) {
ChannelTrigger* trigger = reinterpret_cast<ChannelTrigger*>(&triggerRaw);
Channel& channel = channels_[trigger->fields.chanId];

auto result = ProxyHandlerResult::Continue;

if (trigger->fields.type & TriggerData) {
RegisteredMemory& dst = memories_[trigger->fields.dstMemoryId];
RegisteredMemory& src = memories_[trigger->fields.srcMemoryId];
channel.connection().write(dst, trigger->fields.dstOffset, src, trigger->fields.srcOffset, trigger->fields.size);
}

if (trigger->fields.type & TriggerFlag) {
channel.epoch().signal();
}

if (trigger->fields.type & TriggerSync) {
channel.connection().flush();
result = ProxyHandlerResult::FlushFifoTailAndContinue;
}

return result;
}

} // namespace channel
} // namespace mscclpp
14 changes: 0 additions & 14 deletions src/include/basic_proxy_handler.hpp

This file was deleted.

9 changes: 1 addition & 8 deletions src/include/registered_memory.hpp
@@ -39,14 +39,7 @@ struct RegisteredMemory::Impl {
Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl);
Impl(const std::vector<char>& data);

TransportInfo& getTransportInfo(Transport transport) {
for (auto& entry : transportInfos) {
if (entry.transport == transport) {
return entry;
}
}
throw Error("Transport data not found", ErrorCode::InternalError);
}
const TransportInfo& getTransportInfo(Transport transport) const;
};

} // namespace mscclpp
9 changes: 9 additions & 0 deletions src/registered_memory.cc
@@ -141,4 +141,13 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
}
}

const TransportInfo& RegisteredMemory::Impl::getTransportInfo(Transport transport) const {
for (auto& entry : transportInfos) {
if (entry.transport == transport) {
return entry;
}
}
throw Error("Transport data not found", ErrorCode::InternalError);
}

} // namespace mscclpp
6 changes: 1 addition & 5 deletions test/mp_unit_tests.cu
@@ -671,12 +671,8 @@ class ChannelOneToOneTest : public CommunicatorTestBase {
mscclpp::channel::ChannelId cid = channelService->addChannel(connections[r]);
communicator->setup();

// TODO: enable this when we support out-of-place
// devChannels.emplace_back(channelService->deviceChannel(cid),
// channelService->addMemory(remoteMemory), channelService->addMemory(sendMemory),
// remoteMemory.data(), sendMemory.data(), tmpBuff);
devChannels.emplace_back(channelService->deviceChannel(cid), channelService->addMemory(remoteMemory),
channelService->addMemory(sendMemory), remoteMemory.data(), sendMemory.data());
channelService->addMemory(sendMemory), remoteMemory.data(), sendMemory.data(), tmpBuff);
}
}

7 changes: 2 additions & 5 deletions test/mscclpp-test/common.cu
@@ -72,11 +72,8 @@ double allreduceTime(int worldSize, double value, int average) {
double accumulator = value;

if (average != 0) {
MPI_Op op = average == 1 ? MPI_SUM
: average == 2 ? MPI_MIN
: average == 3 ? MPI_MAX
: average == 4 ? MPI_SUM
: MPI_Op();
MPI_Op op =
average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : average == 3 ? MPI_MAX : average == 4 ? MPI_SUM : MPI_Op();
MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator, 1, MPI_DOUBLE, op, MPI_COMM_WORLD);
}

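The reflowed conditional above only changes formatting; the value-to-operation mapping is unchanged. Read as a table, it is equivalent to the following hypothetical standalone helper (shown for illustration only, not code from the repository):

```cpp
// Equivalent reading of the nested conditional in allreduceTime(); for
// illustration only.
#include <mpi.h>

MPI_Op opForAverageMode(int average) {
  switch (average) {
    case 1: return MPI_SUM;
    case 2: return MPI_MIN;
    case 3: return MPI_MAX;
    case 4: return MPI_SUM;
    default: return MPI_Op();  // matches the original fallback
  }
}
```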
