From 6a63062c6cda28cf6c11b12717044c241774c89f Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 30 Dec 2024 12:47:23 +0100 Subject: [PATCH] TL/CUDA: addressed comments --- src/components/tl/cuda/Makefile.am | 2 +- src/components/tl/cuda/allgather/allgather.c | 2 +- .../tl/cuda/allgather/allgather_linear.c | 2 +- .../tl/cuda/allgatherv/allgatherv.c | 2 +- .../tl/cuda/allgatherv/allgatherv_linear.c | 2 +- src/components/tl/cuda/bcast/bcast.c | 2 +- src/components/tl/cuda/bcast/bcast.h | 2 +- src/components/tl/cuda/bcast/bcast_linear.c | 143 +++++++++--------- .../tl/cuda/reduce_scatter/reduce_scatter.c | 2 +- .../reduce_scatter/reduce_scatter_linear.c | 2 +- .../tl/cuda/reduce_scatterv/reduce_scatterv.c | 2 +- .../reduce_scatterv/reduce_scatterv_linear.c | 2 +- src/components/tl/cuda/tl_cuda.c | 2 +- src/components/tl/cuda/tl_cuda.h | 8 +- src/components/tl/cuda/tl_cuda_coll.c | 2 +- src/components/tl/cuda/tl_cuda_coll.h | 2 +- src/components/tl/cuda/tl_cuda_ring.h | 2 +- src/components/tl/cuda/tl_cuda_team.c | 29 ++-- src/components/tl/cuda/tl_cuda_team_topo.h | 2 +- src/components/tl/ucp/bcast/bcast_knomial.c | 4 +- test/gtest/coll/test_bcast.cc | 2 +- 21 files changed, 111 insertions(+), 107 deletions(-) diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am index 2136821b93..65fb41ca1f 100644 --- a/src/components/tl/cuda/Makefile.am +++ b/src/components/tl/cuda/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) Meta Platforms, Inc. and affiliates. 2022. # diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c index 1e64c0a582..362191b3ac 100644 --- a/src/components/tl/cuda/allgather/allgather.c +++ b/src/components/tl/cuda/allgather/allgather.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c index fefc774628..d0b416257b 100644 --- a/src/components/tl/cuda/allgather/allgather_linear.c +++ b/src/components/tl/cuda/allgather/allgather_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c index 76da65fa65..4a73bbdf08 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv.c +++ b/src/components/tl/cuda/allgatherv/allgatherv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 1f02ad37bd..9a8b5db140 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c index d687d924a0..954cf86d9f 100644 --- a/src/components/tl/cuda/bcast/bcast.c +++ b/src/components/tl/cuda/bcast/bcast.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h index 17d07a529b..5810bcc89d 100644 --- a/src/components/tl/cuda/bcast/bcast.h +++ b/src/components/tl/cuda/bcast/bcast.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 8b33512006..9b2915fa39 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -8,45 +8,38 @@ enum { // Barrier setup stages - STAGE_INIT_BAR_ROOT, // Initial stage for the root rank to identify and claim a free barrier - STAGE_FIND_BAR_PEER, // Stage where peer ranks wait while the root rank identifies a free barrier + STAGE_INIT_BAR_ROOT, // Initial stage for the root rank to identify and claim a free barrier + STAGE_FIND_BAR_PEER, // Stage where peer ranks wait while the root rank identifies a free barrier - STAGE_SYNC, // Initialize the barrier and synchronize the segment required for the current task - STAGE_SETUP, // Verify that all ranks are aligned and have reached the barrier + STAGE_SYNC, // Initialize the barrier and synchronize the segment required for the current task + STAGE_SETUP, // Verify that all ranks are aligned and have reached the barrier // Stages specific to the root rank - STAGE_COPY, // Post copy task: copy data block from src to a scratch buffer - STAGE_WAIT_COPY, // The root waits for the completion of its copy operation - STAGE_WAIT_ALL, // The root rank waits until all other ranks have reached the same operational step - STAGE_WAIT_COMPLETION, // The root rank waits for all other ranks to complete the broadcast operation + STAGE_COPY, // Post copy task: copy data block from src to a scratch buffer + STAGE_WAIT_COPY, // The root waits for the completion of its copy operation + STAGE_WAIT_ALL, // The root rank waits until all other ranks have reached the same operational step + STAGE_WAIT_COMPLETION, // The root rank waits for all other ranks to complete the broadcast operation // non-root - STAGE_WAIT_ROOT, // Wait while the root rank writes data to its scratch buffer - STAGE_CLIENT_COPY, // Initiate their own copy tasks after the root's operations - STAGE_CLIENT_COPY_WAIT, // Wait for the completion of the copy operation from the root's scratch buffer - STAGE_CLIENT_WAIT_COMPLETION, // Wait for the completion of algorithm on all ranks, global sync with root + STAGE_WAIT_ROOT, // Wait while the root rank writes data to its scratch buffer + STAGE_CLIENT_COPY, // Initiate their own copy tasks after the root's operations + STAGE_CLIENT_COPY_WAIT, // Wait for the completion of the copy operation from 
the root's scratch buffer + STAGE_CLIENT_WAIT_COMPLETION, // Wait for the completion of algorithm on all ranks, global sync with root }; -static inline ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) +static inline ucc_status_t +ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_status_t status; set_rank_step(task, trank, 0, 0); // Initialize rank step tracking ucc_memory_cpu_store_fence(); // initiate barrier wait while all ranks set theirs steps to 0 - status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); - if (ucc_unlikely(status != UCC_OK)) { - goto exit_err; - } - - return UCC_OK; - -exit_err: - return status; + return ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); } // Tests if setup is complete for a linear broadcast task -static inline ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) +static inline ucc_status_t +ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); @@ -85,8 +78,8 @@ static inline ucc_status_t root_find_free_barrier(ucc_tl_cuda_task_t *task) for (i = 0; i < max_concurrent; ++i) { curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); // try to set user specified tag to mark that this barrier is used by this task - if (ucc_atomic_cswap64(&curr_bar->tag, UCC_TAG_FREE, - task->bcast_linear.key) == UCC_TAG_FREE) { + if (ucc_atomic_cswap64(&curr_bar->tag, UCC_TL_CUDA_TAG_FREE, + task->bcast_linear.key) == UCC_TL_CUDA_TAG_FREE) { ucc_debug("Acquire barrier: %p idx: %d marked with tag: %ld", curr_bar, i, curr_bar->tag); task->bar = curr_bar; @@ -135,7 +128,8 @@ static inline ucc_status_t peer_find_free_barrier(ucc_tl_cuda_task_t *task) return UCC_ERR_NOT_FOUND; } -static ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) +static ucc_status_t +ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); @@ -230,7 +224,7 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; @@ -241,18 +235,16 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) etask = task->bcast_linear.exec_task; ucc_assert(NULL != etask); st = ucc_ee_executor_task_test(etask); - if (st == UCC_OK) { - ucc_ee_executor_task_finalize(etask); - task->bcast_linear.exec_task = NULL; - // signal others - ++task->bcast_linear.step; - set_rank_step(task, task->bcast_linear.root, - task->bcast_linear.step, 0); - task->bcast_linear.stage = STAGE_WAIT_ALL; - } else { - // not ready - return; + if (st != UCC_OK) { + return; // not ready } + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + // signal others + ++task->bcast_linear.step; + set_rank_step(task, task->bcast_linear.root, + task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_WAIT_ALL; case STAGE_WAIT_ALL: for (i = 0; i < tsize; ++i) { if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) { @@ -262,7 +254,8 @@ static void 
ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) peer = i; } // need to wait until all ranks complete step - 1, because of double buffering - if (get_rank_step(task, peer, 0) < task->bcast_linear.step - 1) { + if (get_rank_step(task, peer, 0) < + task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } @@ -272,16 +265,15 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // go to next iteration task->bcast_linear.stage = STAGE_COPY; return; - } else { - // finish - st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); - if (ucc_unlikely(st != UCC_OK)) { - ucc_error("failed to start barrier from root rank"); - task->super.status = st; - return; - } - task->bcast_linear.stage = STAGE_WAIT_COMPLETION; } + // finish + st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); + if (ucc_unlikely(st != UCC_OK)) { + ucc_error("failed to start barrier from root rank"); + task->super.status = st; + return; + } + task->bcast_linear.stage = STAGE_WAIT_COMPLETION; case STAGE_WAIT_COMPLETION: st = ucc_tl_cuda_shm_barrier_test(trank, task->bar); if (st != UCC_OK) { @@ -292,10 +284,12 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // set barrier free to unlock others, this is roots responsibility ucc_debug("Release bar: %p with tag: %ld", task->bar, task->bar->tag); - task->bar->tag = UCC_TAG_FREE; + task->bar->tag = UCC_TL_CUDA_TAG_FREE; ucc_tl_cuda_put_sync_root(task, task->bcast_linear.root); task->super.status = UCC_OK; + break; default: + ucc_assert(0); break; } } else { @@ -316,7 +310,7 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); task->super.status = st; @@ -327,24 +321,24 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) etask = task->bcast_linear.exec_task; ucc_assert(NULL != etask); st = ucc_ee_executor_task_test(etask); - if (st == UCC_OK) { - ucc_ee_executor_task_finalize(etask); - task->bcast_linear.exec_task = NULL; - ++task->bcast_linear.step; - set_rank_step(task, trank, task->bcast_linear.step, 0); - if (task->bcast_linear.step < task->bcast_linear.num_steps) { - task->bcast_linear.stage = STAGE_WAIT_ROOT; - return; - } else { - // start barrier to sync with root - task->bcast_linear.stage = STAGE_CLIENT_WAIT_COMPLETION; - st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); - if (ucc_unlikely(st != UCC_OK)) { - ucc_error("failed to start barrier from peer rank"); - task->super.status = st; - return; - } - } + if (st != UCC_OK) { + return; // executor task is not ready + } + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ++task->bcast_linear.step; + set_rank_step(task, trank, task->bcast_linear.step, 0); + if (task->bcast_linear.step < task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + return; + } + // start barrier to sync with root + task->bcast_linear.stage = STAGE_CLIENT_WAIT_COMPLETION; + st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); + if (ucc_unlikely(st != UCC_OK)) { + ucc_error("failed to start barrier from peer rank"); + task->super.status = st; + return; } break; case STAGE_CLIENT_WAIT_COMPLETION: @@ -355,7 +349,9 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) 
return; } task->super.status = UCC_OK; + break; default: + ucc_assert(0); break; } } @@ -373,9 +369,12 @@ static ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) // in case of active set bcast we need to do additional steps to find free barriers if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) { - task->bcast_linear.stage = UCC_TL_TEAM_RANK(team) == task->bcast_linear.root ? STAGE_INIT_BAR_ROOT : STAGE_FIND_BAR_PEER; + task->bcast_linear.stage = + UCC_TL_TEAM_RANK(team) == task->bcast_linear.root + ? STAGE_INIT_BAR_ROOT + : STAGE_FIND_BAR_PEER; } - + task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, half_scratch_size); diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c index 1e1d75c3ed..237005c95b 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c index 9a025267ca..36801ce1d8 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c index d954e38e9e..eef433cdbb 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index d719632853..56e4e2204c 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c index 18135fae00..842db59c72 100644 --- a/src/components/tl/cuda/tl_cuda.c +++ b/src/components/tl/cuda/tl_cuda.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index d61c032261..9742ac8ba2 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * * See file LICENSE for terms. @@ -74,7 +74,7 @@ extern ucc_tl_cuda_iface_t ucc_tl_cuda; typedef struct ucc_tl_cuda_lib_config { ucc_tl_lib_config_t super; - uint32_t max_concurrent; + uint32_t max_concurrent; // Maximum number of tasks that can be progressed simultaneously. size_t scratch_size; unsigned long allgather_ring_max_rings; uint32_t allgather_ring_num_chunks; @@ -106,7 +106,7 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_context_t, const ucc_base_context_params_t *, typedef uint32_t ucc_tl_cuda_sync_state_t; -#define UCC_TAG_FREE 0xFFFFFFFFFFFFFFFF +#define UCC_TL_CUDA_TAG_FREE 0xFFFFFFFFFFFFFFFF typedef struct ucc_tl_cuda_shm_barrier { ucc_rank_t size; @@ -180,7 +180,7 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *, typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t; struct ucc_tl_cuda_task { ucc_coll_task_t super; - uint32_t seq_num; // Sequential identifier for each taks started within the team + uint32_t seq_num; // Sequential identifier for each task started within the team uint32_t coll_id; // Index of the collective task in flight, within the range [0; max_concurrent) ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the reserved barrier for this task in the CUDA team ucc_subset_t subset; // Mapping information for the active set, if it is present diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c index ab2a5ff733..71325dc826 100644 --- a/src/components/tl/cuda/tl_cuda_coll.c +++ b/src/components/tl/cuda/tl_cuda_coll.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index bedc531465..328db791ab 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h index 621e074184..13835df99d 100644 --- a/src/components/tl/cuda/tl_cuda_ring.h +++ b/src/components/tl/cuda/tl_cuda_ring.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c index 8fe668249a..3b8a5fb253 100644 --- a/src/components/tl/cuda/tl_cuda_team.c +++ b/src/components/tl/cuda/tl_cuda_team.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -22,6 +22,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, ucc_derived_of(tl_context, ucc_tl_cuda_context_t); ucc_tl_cuda_lib_t *lib = ucc_derived_of(tl_context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. 
+ uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_tl_cuda_shm_barrier_t *bar; ucc_status_t status; int shm_id, i, j; @@ -46,7 +48,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, } // active set - scratch_size = 2 * lib->cfg.max_concurrent * lib->cfg.scratch_size; + scratch_size = resource_num * lib->cfg.scratch_size; status = CUDA_FUNC(cudaMalloc(&self->scratch.loc, scratch_size)); if (status != UCC_OK) { tl_error(tl_context->lib, "failed to alloc scratch buffer"); @@ -79,12 +81,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, goto ids_exchange; } memset(self->sync, 0, ctrl_size); - self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0, - lib->cfg.max_concurrent * 2); + self->bar = (ucc_tl_cuda_shm_barrier_t *)UCC_TL_CUDA_TEAM_SYNC( + self, 0, resource_num); /* active set */ - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { bar = UCC_TL_CUDA_TEAM_BARRIER(self, i); - bar->tag = UCC_TAG_FREE; // mark as free + bar->tag = UCC_TL_CUDA_TAG_FREE; // mark as free for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self), j, bar); @@ -132,6 +134,8 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t) { ucc_tl_cuda_lib_t *lib = ucc_derived_of(self->super.super.context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. + uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_tl_cuda_sync_t *sync; cudaError_t st; int i, j; @@ -142,7 +146,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t) } if (self->ids) { if (self->sync != (void*)-1) { - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { if (j == UCC_TL_TEAM_RANK(self)) { continue; @@ -204,6 +208,8 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_lib_t *lib = ucc_derived_of(tl_team->context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. 
+ uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_status_t status; ucc_tl_cuda_sync_t *sync; ucc_tl_cuda_shm_barrier_t *bar; @@ -273,15 +279,14 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) goto exit_err; } team->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(team, 0, - lib->cfg.max_concurrent * 2); + resource_num); } team->sync_state = (ucc_tl_cuda_sync_state_t*)PTR_OFFSET(team->bar, sizeof(ucc_tl_cuda_shm_barrier_t) * - lib->cfg.max_concurrent * 2); + resource_num); CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, cudaStreamNonBlocking), exit_err, status); - // second max_concurent events are unused for bcast active set - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i); CUDA_CHECK_GOTO(cudaEventCreateWithFlags(&sync->ipc_event_local, cudaEventDisableTiming | @@ -309,7 +314,7 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) goto exit_err; } - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i); for (j = 0 ; j < UCC_TL_TEAM_SIZE(team); j++) { if (j == UCC_TL_TEAM_RANK(team)) { diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h index 1d7d19ad1c..a56b28bf21 100644 --- a/src/components/tl/cuda/tl_cuda_team_topo.h +++ b/src/components/tl/cuda/tl_cuda_team_topo.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/ucp/bcast/bcast_knomial.c b/src/components/tl/ucp/bcast/bcast_knomial.c index 21f47bf293..62430024bf 100644 --- a/src/components/tl/ucp/bcast/bcast_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_knomial.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -22,7 +22,7 @@ void ucc_tl_ucp_bcast_knomial_progress(ucc_coll_task_t *coll_task) ucc_rank_t size = (ucc_rank_t)task->subset.map.ep_num; uint32_t radix = task->bcast_kn.radix; - ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; + ucc_rank_t root = (ucc_rank_t)TASK_ARGS(task).root; ucc_rank_t dist = task->bcast_kn.dist; void *buffer = TASK_ARGS(task).src.info.buffer; ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index 69f697a508..5ceca3545f 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * See file LICENSE for terms. */
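
The rename of UCC_TAG_FREE to UCC_TL_CUDA_TAG_FREE touches the active-set barrier pool: in root_find_free_barrier() the root claims a slot from the second half of the pool by compare-and-swapping the free tag with its task key, and it later releases the slot by writing the free tag back in STAGE_WAIT_COMPLETION. The standalone sketch below shows that claim/release pattern with C11 atomics; FREE_TAG, bar_pool and claim_barrier are hypothetical stand-ins for illustration only, not the UCC API (the real code uses ucc_atomic_cswap64 on bar->tag).

/* Minimal sketch of claiming and releasing a barrier slot, assuming a
 * flat pool of tags; not the actual ucc_tl_cuda_shm_barrier_t layout. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FREE_TAG UINT64_MAX   /* analogue of UCC_TL_CUDA_TAG_FREE */
#define MAX_CONCURRENT 4      /* pool reserved for active-set tasks */

static _Atomic uint64_t bar_pool[MAX_CONCURRENT];

/* Try each slot; return its index if the CAS succeeds, -1 otherwise. */
static int claim_barrier(uint64_t key)
{
    for (int i = 0; i < MAX_CONCURRENT; ++i) {
        uint64_t expected = FREE_TAG;
        if (atomic_compare_exchange_strong(&bar_pool[i], &expected, key)) {
            return i; /* slot i now carries this task's key */
        }
    }
    return -1; /* no free barrier, caller retries on the next progress call */
}

static void release_barrier(int idx)
{
    atomic_store(&bar_pool[idx], FREE_TAG); /* root's responsibility */
}

int main(void)
{
    for (int i = 0; i < MAX_CONCURRENT; ++i) {
        atomic_init(&bar_pool[i], FREE_TAG);
    }
    int idx = claim_barrier(0x42);
    printf("claimed slot %d\n", idx);
    release_barrier(idx);
    return 0;
}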
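
The restructured STAGE_COPY/STAGE_WAIT_ALL path in bcast_linear relies on double buffering: the scratch buffer is split in two halves, the root stages chunk k into half (k % 2), and it only advances once every peer has finished step k - 1, so the half being overwritten is never still in use. The arithmetic below reproduces that chunking (num_steps, per-step offset, scratch half selection) with made-up sizes; it is a sketch of the indexing only, not the actual copy path.

/* Double-buffered chunking as used by the linear bcast; sizes are
 * illustrative, half_scratch_size stands in for half of the team scratch. */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t scratch_size      = 1 << 20;                    /* assumption */
    size_t half_scratch_size = scratch_size / 2;
    size_t total             = 5 * half_scratch_size + 123; /* message size */
    size_t num_steps         = (total + half_scratch_size - 1) /
                               half_scratch_size;           /* div_round_up */

    for (size_t step = 0; step < num_steps; ++step) {
        size_t offset = step * half_scratch_size;            /* user buffer */
        size_t chunk  = (total - offset < half_scratch_size) ?
                        (total - offset) : half_scratch_size;
        size_t slot   = (step % 2) * half_scratch_size;      /* scratch half */
        printf("step %zu: copy %zu bytes from offset %zu into scratch+%zu\n",
               step, chunk, offset, slot);
    }
    return 0;
}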
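
The new resource_num variable in tl_cuda_team.c makes the sizing rule explicit: the team preallocates max_concurrent * 2 resource groups (sync structures, barriers, scratch slots), the first half for regular collectives and the second half for active-set bcast. The short sketch below just spells out that arithmetic; the constants are placeholders, not the library defaults.

/* Resource sizing sketch, assuming placeholder config values. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t max_concurrent = 8;        /* stands in for lib->cfg.max_concurrent */
    size_t   scratch_chunk  = 1 << 20;  /* stands in for lib->cfg.scratch_size   */
    uint32_t resource_num   = max_concurrent * 2;

    printf("barrier slots: %u (regular: 0..%u, active set: %u..%u)\n",
           resource_num, max_concurrent - 1, max_concurrent, resource_num - 1);
    printf("total scratch bytes: %zu\n", (size_t)resource_num * scratch_chunk);
    return 0;
}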