diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 49ceddf0a..b8cd08cf7 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -17,6 +17,8 @@ std::vector filter(const std::vector& vec, Predicate pred) { auto getOpType = [](const std::string& str) { if (str == "nop") { + return mscclpp::OperationType::THREADBLOCK_BARRIER; + } else if (str == "barrier") { return mscclpp::OperationType::BARRIER; } else if (str == "put") { return mscclpp::OperationType::PUT; @@ -456,6 +458,12 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse operation.size = this->getNChunkSize(rank, this->inputSize, this->outputSize, (uint32_t)op["cnt"], chunkIndexes); } + if (op.contains("barrier_id")) { + operation.deviceSyncerIndex = op["barrier_id"]; + } + if (op.contains("nthread_blocks")) { + operation.nThreadBlocks = op["nthread_blocks"]; + } ops.push_back(operation); } this->operations[rank].push_back(ops); diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 5fb2dbf90..00073e185 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -30,6 +30,7 @@ enum class ChannelType : uint8_t { // NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`. enum class OperationType : uint8_t { + THREADBLOCK_BARRIER, BARRIER, PUT, PUT_PACKET, diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 6bf7462f4..193651c47 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -530,7 +530,9 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu event_buffer, &event_buffer_head); #endif - if (op.type == OperationType::BARRIER) { + if (op.type == OperationType::THREADBLOCK_BARRIER) { + __syncthreads(); + } else if (op.type == OperationType::BARRIER) { int nThreadBlocks = op.nThreadBlocks; int syncStateIndex = op.deviceSyncerIndex; deviceSyncers[syncStateIndex].sync(nThreadBlocks); diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py index 9a5b88b44..5547bd771 100644 --- a/tools/npkit/npkit_trace_generator.py +++ b/tools/npkit/npkit_trace_generator.py @@ -11,6 +11,7 @@ def parse_npkit_event_header(npkit_event_header_path): npkit_event_def = {"id_to_type": {}, "type_to_id": {}} executor_ops = [ + "THREADBLOCK_BARRIER", "BARRIER", "PUT", "PUT_PACKET",