From 6a63062c6cda28cf6c11b12717044c241774c89f Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 30 Dec 2024 12:47:23 +0100 Subject: [PATCH] TL/CUDA: addressed comments --- src/components/tl/cuda/Makefile.am | 2 +- src/components/tl/cuda/allgather/allgather.c | 2 +- .../tl/cuda/allgather/allgather_linear.c | 2 +- .../tl/cuda/allgatherv/allgatherv.c | 2 +- .../tl/cuda/allgatherv/allgatherv_linear.c | 2 +- src/components/tl/cuda/bcast/bcast.c | 2 +- src/components/tl/cuda/bcast/bcast.h | 2 +- src/components/tl/cuda/bcast/bcast_linear.c | 143 +++++++++--------- .../tl/cuda/reduce_scatter/reduce_scatter.c | 2 +- .../reduce_scatter/reduce_scatter_linear.c | 2 +- .../tl/cuda/reduce_scatterv/reduce_scatterv.c | 2 +- .../reduce_scatterv/reduce_scatterv_linear.c | 2 +- src/components/tl/cuda/tl_cuda.c | 2 +- src/components/tl/cuda/tl_cuda.h | 8 +- src/components/tl/cuda/tl_cuda_coll.c | 2 +- src/components/tl/cuda/tl_cuda_coll.h | 2 +- src/components/tl/cuda/tl_cuda_ring.h | 2 +- src/components/tl/cuda/tl_cuda_team.c | 29 ++-- src/components/tl/cuda/tl_cuda_team_topo.h | 2 +- src/components/tl/ucp/bcast/bcast_knomial.c | 4 +- test/gtest/coll/test_bcast.cc | 2 +- 21 files changed, 111 insertions(+), 107 deletions(-) diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am index 2136821b93..65fb41ca1f 100644 --- a/src/components/tl/cuda/Makefile.am +++ b/src/components/tl/cuda/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) Meta Platforms, Inc. and affiliates. 2022. # diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c index 1e64c0a582..362191b3ac 100644 --- a/src/components/tl/cuda/allgather/allgather.c +++ b/src/components/tl/cuda/allgather/allgather.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c index fefc774628..d0b416257b 100644 --- a/src/components/tl/cuda/allgather/allgather_linear.c +++ b/src/components/tl/cuda/allgather/allgather_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c index 76da65fa65..4a73bbdf08 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv.c +++ b/src/components/tl/cuda/allgatherv/allgatherv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 1f02ad37bd..9a8b5db140 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c index d687d924a0..954cf86d9f 100644 --- a/src/components/tl/cuda/bcast/bcast.c +++ b/src/components/tl/cuda/bcast/bcast.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h index 17d07a529b..5810bcc89d 100644 --- a/src/components/tl/cuda/bcast/bcast.h +++ b/src/components/tl/cuda/bcast/bcast.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 8b33512006..9b2915fa39 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -8,45 +8,38 @@ enum { // Barrier setup stages - STAGE_INIT_BAR_ROOT, // Initial stage for the root rank to identify and claim a free barrier - STAGE_FIND_BAR_PEER, // Stage where peer ranks wait while the root rank identifies a free barrier + STAGE_INIT_BAR_ROOT, // Initial stage for the root rank to identify and claim a free barrier + STAGE_FIND_BAR_PEER, // Stage where peer ranks wait while the root rank identifies a free barrier - STAGE_SYNC, // Initialize the barrier and synchronize the segment required for the current task - STAGE_SETUP, // Verify that all ranks are aligned and have reached the barrier + STAGE_SYNC, // Initialize the barrier and synchronize the segment required for the current task + STAGE_SETUP, // Verify that all ranks are aligned and have reached the barrier // Stages specific to the root rank - STAGE_COPY, // Post copy task: copy data block from src to a scratch buffer - STAGE_WAIT_COPY, // The root waits for the completion of its copy operation - STAGE_WAIT_ALL, // The root rank waits until all other ranks have reached the same operational step - STAGE_WAIT_COMPLETION, // The root rank waits for all other ranks to complete the broadcast operation + STAGE_COPY, // Post copy task: copy data block from src to a scratch buffer + STAGE_WAIT_COPY, // The root waits for the completion of its copy operation + STAGE_WAIT_ALL, // The root rank waits until all other ranks have reached the same operational step + STAGE_WAIT_COMPLETION, // The root rank waits for all other ranks to complete the broadcast operation // non-root - STAGE_WAIT_ROOT, // Wait while the root rank writes data to its scratch buffer - STAGE_CLIENT_COPY, // Initiate their own copy tasks after the root's operations - STAGE_CLIENT_COPY_WAIT, // Wait for the completion of the copy operation from the root's scratch buffer - STAGE_CLIENT_WAIT_COMPLETION, // Wait for the completion of algorithm on all ranks, global sync with root + STAGE_WAIT_ROOT, // Wait while the root rank writes data to its scratch buffer + STAGE_CLIENT_COPY, // Initiate their own copy tasks after the root's operations + STAGE_CLIENT_COPY_WAIT, // Wait for the completion of the copy operation from 
the root's scratch buffer + STAGE_CLIENT_WAIT_COMPLETION, // Wait for the completion of algorithm on all ranks, global sync with root }; -static inline ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) +static inline ucc_status_t +ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_status_t status; set_rank_step(task, trank, 0, 0); // Initialize rank step tracking ucc_memory_cpu_store_fence(); // initiate barrier wait while all ranks set theirs steps to 0 - status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); - if (ucc_unlikely(status != UCC_OK)) { - goto exit_err; - } - - return UCC_OK; - -exit_err: - return status; + return ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); } // Tests if setup is complete for a linear broadcast task -static inline ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) +static inline ucc_status_t +ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); @@ -85,8 +78,8 @@ static inline ucc_status_t root_find_free_barrier(ucc_tl_cuda_task_t *task) for (i = 0; i < max_concurrent; ++i) { curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); // try to set user specified tag to mark that this barrier is used by this task - if (ucc_atomic_cswap64(&curr_bar->tag, UCC_TAG_FREE, - task->bcast_linear.key) == UCC_TAG_FREE) { + if (ucc_atomic_cswap64(&curr_bar->tag, UCC_TL_CUDA_TAG_FREE, + task->bcast_linear.key) == UCC_TL_CUDA_TAG_FREE) { ucc_debug("Acquire barrier: %p idx: %d marked with tag: %ld", curr_bar, i, curr_bar->tag); task->bar = curr_bar; @@ -135,7 +128,8 @@ static inline ucc_status_t peer_find_free_barrier(ucc_tl_cuda_task_t *task) return UCC_ERR_NOT_FOUND; } -static ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) +static ucc_status_t +ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); @@ -230,7 +224,7 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; @@ -241,18 +235,16 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) etask = task->bcast_linear.exec_task; ucc_assert(NULL != etask); st = ucc_ee_executor_task_test(etask); - if (st == UCC_OK) { - ucc_ee_executor_task_finalize(etask); - task->bcast_linear.exec_task = NULL; - // signal others - ++task->bcast_linear.step; - set_rank_step(task, task->bcast_linear.root, - task->bcast_linear.step, 0); - task->bcast_linear.stage = STAGE_WAIT_ALL; - } else { - // not ready - return; + if (st != UCC_OK) { + return; // not ready } + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + // signal others + ++task->bcast_linear.step; + set_rank_step(task, task->bcast_linear.root, + task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_WAIT_ALL; case STAGE_WAIT_ALL: for (i = 0; i < tsize; ++i) { if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) { @@ -262,7 +254,8 @@ static void 
ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) peer = i; } // need to wait until all ranks complete step - 1, because of double buffering - if (get_rank_step(task, peer, 0) < task->bcast_linear.step - 1) { + if (get_rank_step(task, peer, 0) < + task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } @@ -272,16 +265,15 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // go to next iteration task->bcast_linear.stage = STAGE_COPY; return; - } else { - // finish - st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); - if (ucc_unlikely(st != UCC_OK)) { - ucc_error("failed to start barrier from root rank"); - task->super.status = st; - return; - } - task->bcast_linear.stage = STAGE_WAIT_COMPLETION; } + // finish + st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); + if (ucc_unlikely(st != UCC_OK)) { + ucc_error("failed to start barrier from root rank"); + task->super.status = st; + return; + } + task->bcast_linear.stage = STAGE_WAIT_COMPLETION; case STAGE_WAIT_COMPLETION: st = ucc_tl_cuda_shm_barrier_test(trank, task->bar); if (st != UCC_OK) { @@ -292,10 +284,12 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // set barrier free to unlock others, this is roots responsibility ucc_debug("Release bar: %p with tag: %ld", task->bar, task->bar->tag); - task->bar->tag = UCC_TAG_FREE; + task->bar->tag = UCC_TL_CUDA_TAG_FREE; ucc_tl_cuda_put_sync_root(task, task->bcast_linear.root); task->super.status = UCC_OK; + break; default: + ucc_assert(0); break; } } else { @@ -316,7 +310,7 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); task->super.status = st; @@ -327,24 +321,24 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) etask = task->bcast_linear.exec_task; ucc_assert(NULL != etask); st = ucc_ee_executor_task_test(etask); - if (st == UCC_OK) { - ucc_ee_executor_task_finalize(etask); - task->bcast_linear.exec_task = NULL; - ++task->bcast_linear.step; - set_rank_step(task, trank, task->bcast_linear.step, 0); - if (task->bcast_linear.step < task->bcast_linear.num_steps) { - task->bcast_linear.stage = STAGE_WAIT_ROOT; - return; - } else { - // start barrier to sync with root - task->bcast_linear.stage = STAGE_CLIENT_WAIT_COMPLETION; - st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); - if (ucc_unlikely(st != UCC_OK)) { - ucc_error("failed to start barrier from peer rank"); - task->super.status = st; - return; - } - } + if (st != UCC_OK) { + return; // executor task is not ready + } + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ++task->bcast_linear.step; + set_rank_step(task, trank, task->bcast_linear.step, 0); + if (task->bcast_linear.step < task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + return; + } + // start barrier to sync with root + task->bcast_linear.stage = STAGE_CLIENT_WAIT_COMPLETION; + st = ucc_tl_cuda_shm_barrier_start(trank, task->bar); + if (ucc_unlikely(st != UCC_OK)) { + ucc_error("failed to start barrier from peer rank"); + task->super.status = st; + return; } break; case STAGE_CLIENT_WAIT_COMPLETION: @@ -355,7 +349,9 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) 
return; } task->super.status = UCC_OK; + break; default: + ucc_assert(0); break; } } @@ -373,9 +369,12 @@ static ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) // in case of active set bcast we need to do additional steps to find free barriers if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) { - task->bcast_linear.stage = UCC_TL_TEAM_RANK(team) == task->bcast_linear.root ? STAGE_INIT_BAR_ROOT : STAGE_FIND_BAR_PEER; + task->bcast_linear.stage = + UCC_TL_TEAM_RANK(team) == task->bcast_linear.root + ? STAGE_INIT_BAR_ROOT + : STAGE_FIND_BAR_PEER; } - + task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, half_scratch_size); diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c index 1e1d75c3ed..237005c95b 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c index 9a025267ca..36801ce1d8 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c index d954e38e9e..eef433cdbb 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index d719632853..56e4e2204c 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c index 18135fae00..842db59c72 100644 --- a/src/components/tl/cuda/tl_cuda.c +++ b/src/components/tl/cuda/tl_cuda.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index d61c032261..9742ac8ba2 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * * See file LICENSE for terms. @@ -74,7 +74,7 @@ extern ucc_tl_cuda_iface_t ucc_tl_cuda; typedef struct ucc_tl_cuda_lib_config { ucc_tl_lib_config_t super; - uint32_t max_concurrent; + uint32_t max_concurrent; // Maximum number of tasks that can be progressed simultaneously. size_t scratch_size; unsigned long allgather_ring_max_rings; uint32_t allgather_ring_num_chunks; @@ -106,7 +106,7 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_context_t, const ucc_base_context_params_t *, typedef uint32_t ucc_tl_cuda_sync_state_t; -#define UCC_TAG_FREE 0xFFFFFFFFFFFFFFFF +#define UCC_TL_CUDA_TAG_FREE 0xFFFFFFFFFFFFFFFF typedef struct ucc_tl_cuda_shm_barrier { ucc_rank_t size; @@ -180,7 +180,7 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *, typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t; struct ucc_tl_cuda_task { ucc_coll_task_t super; - uint32_t seq_num; // Sequential identifier for each taks started within the team + uint32_t seq_num; // Sequential identifier for each task started within the team uint32_t coll_id; // Index of the collective task in flight, within the range [0; max_concurrent) ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the reserved barrier for this task in the CUDA team ucc_subset_t subset; // Mapping information for the active set, if it is present diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c index ab2a5ff733..71325dc826 100644 --- a/src/components/tl/cuda/tl_cuda_coll.c +++ b/src/components/tl/cuda/tl_cuda_coll.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index bedc531465..328db791ab 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h index 621e074184..13835df99d 100644 --- a/src/components/tl/cuda/tl_cuda_ring.h +++ b/src/components/tl/cuda/tl_cuda_ring.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c index 8fe668249a..3b8a5fb253 100644 --- a/src/components/tl/cuda/tl_cuda_team.c +++ b/src/components/tl/cuda/tl_cuda_team.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -22,6 +22,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, ucc_derived_of(tl_context, ucc_tl_cuda_context_t); ucc_tl_cuda_lib_t *lib = ucc_derived_of(tl_context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. 
+ uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_tl_cuda_shm_barrier_t *bar; ucc_status_t status; int shm_id, i, j; @@ -46,7 +48,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, } // active set - scratch_size = 2 * lib->cfg.max_concurrent * lib->cfg.scratch_size; + scratch_size = resource_num * lib->cfg.scratch_size; status = CUDA_FUNC(cudaMalloc(&self->scratch.loc, scratch_size)); if (status != UCC_OK) { tl_error(tl_context->lib, "failed to alloc scratch buffer"); @@ -79,12 +81,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, goto ids_exchange; } memset(self->sync, 0, ctrl_size); - self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0, - lib->cfg.max_concurrent * 2); + self->bar = (ucc_tl_cuda_shm_barrier_t *)UCC_TL_CUDA_TEAM_SYNC( + self, 0, resource_num); /* active set */ - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { bar = UCC_TL_CUDA_TEAM_BARRIER(self, i); - bar->tag = UCC_TAG_FREE; // mark as free + bar->tag = UCC_TL_CUDA_TAG_FREE; // mark as free for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self), j, bar); @@ -132,6 +134,8 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t) { ucc_tl_cuda_lib_t *lib = ucc_derived_of(self->super.super.context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. + uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_tl_cuda_sync_t *sync; cudaError_t st; int i, j; @@ -142,7 +146,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t) } if (self->ids) { if (self->sync != (void*)-1) { - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { if (j == UCC_TL_TEAM_RANK(self)) { continue; @@ -204,6 +208,8 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_lib_t *lib = ucc_derived_of(tl_team->context->lib, ucc_tl_cuda_lib_t); + // Number of preallocated resource groups for tasks, including the active set. 
+ uint32_t resource_num = lib->cfg.max_concurrent * 2; ucc_status_t status; ucc_tl_cuda_sync_t *sync; ucc_tl_cuda_shm_barrier_t *bar; @@ -273,15 +279,14 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) goto exit_err; } team->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(team, 0, - lib->cfg.max_concurrent * 2); + resource_num); } team->sync_state = (ucc_tl_cuda_sync_state_t*)PTR_OFFSET(team->bar, sizeof(ucc_tl_cuda_shm_barrier_t) * - lib->cfg.max_concurrent * 2); + resource_num); CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, cudaStreamNonBlocking), exit_err, status); - // second max_concurent events are unused for bcast active set - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i); CUDA_CHECK_GOTO(cudaEventCreateWithFlags(&sync->ipc_event_local, cudaEventDisableTiming | @@ -309,7 +314,7 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team) goto exit_err; } - for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { + for (i = 0; i < resource_num; i++) { sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i); for (j = 0 ; j < UCC_TL_TEAM_SIZE(team); j++) { if (j == UCC_TL_TEAM_RANK(team)) { diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h index 1d7d19ad1c..a56b28bf21 100644 --- a/src/components/tl/cuda/tl_cuda_team_topo.h +++ b/src/components/tl/cuda/tl_cuda_team_topo.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/ucp/bcast/bcast_knomial.c b/src/components/tl/ucp/bcast/bcast_knomial.c index 21f47bf293..62430024bf 100644 --- a/src/components/tl/ucp/bcast/bcast_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_knomial.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -22,7 +22,7 @@ void ucc_tl_ucp_bcast_knomial_progress(ucc_coll_task_t *coll_task) ucc_rank_t size = (ucc_rank_t)task->subset.map.ep_num; uint32_t radix = task->bcast_kn.radix; - ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; + ucc_rank_t root = (ucc_rank_t)TASK_ARGS(task).root; ucc_rank_t dist = task->bcast_kn.dist; void *buffer = TASK_ARGS(task).src.info.buffer; ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index 69f697a508..5ceca3545f 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * See file LICENSE for terms. */
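
The rename of UCC_TAG_FREE to UCC_TL_CUDA_TAG_FREE touches the active-set barrier pool: in root_find_free_barrier() the root claims a slot from the second half of the pool by compare-and-swapping the free tag with its task key, and it later releases the slot by writing the free tag back in STAGE_WAIT_COMPLETION. The standalone sketch below shows that claim/release pattern with C11 atomics; FREE_TAG, bar_pool and claim_barrier are hypothetical stand-ins for illustration only, not the UCC API (the real code uses ucc_atomic_cswap64 on bar->tag).

/* Minimal sketch of claiming and releasing a barrier slot, assuming a
 * flat pool of tags; not the actual ucc_tl_cuda_shm_barrier_t layout. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FREE_TAG UINT64_MAX   /* analogue of UCC_TL_CUDA_TAG_FREE */
#define MAX_CONCURRENT 4      /* pool reserved for active-set tasks */

static _Atomic uint64_t bar_pool[MAX_CONCURRENT];

/* Try each slot; return its index if the CAS succeeds, -1 otherwise. */
static int claim_barrier(uint64_t key)
{
    for (int i = 0; i < MAX_CONCURRENT; ++i) {
        uint64_t expected = FREE_TAG;
        if (atomic_compare_exchange_strong(&bar_pool[i], &expected, key)) {
            return i; /* slot i now carries this task's key */
        }
    }
    return -1; /* no free barrier, caller retries on the next progress call */
}

static void release_barrier(int idx)
{
    atomic_store(&bar_pool[idx], FREE_TAG); /* root's responsibility */
}

int main(void)
{
    for (int i = 0; i < MAX_CONCURRENT; ++i) {
        atomic_init(&bar_pool[i], FREE_TAG);
    }
    int idx = claim_barrier(0x42);
    printf("claimed slot %d\n", idx);
    release_barrier(idx);
    return 0;
}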
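
The restructured STAGE_COPY/STAGE_WAIT_ALL path in bcast_linear relies on double buffering: the scratch buffer is split in two halves, the root stages chunk k into half (k % 2), and it only advances once every peer has finished step k - 1, so the half being overwritten is never still in use. The arithmetic below reproduces that chunking (num_steps, per-step offset, scratch half selection) with made-up sizes; it is a sketch of the indexing only, not the actual copy path.

/* Double-buffered chunking as used by the linear bcast; sizes are
 * illustrative, half_scratch_size stands in for half of the team scratch. */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t scratch_size      = 1 << 20;                    /* assumption */
    size_t half_scratch_size = scratch_size / 2;
    size_t total             = 5 * half_scratch_size + 123; /* message size */
    size_t num_steps         = (total + half_scratch_size - 1) /
                               half_scratch_size;           /* div_round_up */

    for (size_t step = 0; step < num_steps; ++step) {
        size_t offset = step * half_scratch_size;            /* user buffer */
        size_t chunk  = (total - offset < half_scratch_size) ?
                        (total - offset) : half_scratch_size;
        size_t slot   = (step % 2) * half_scratch_size;      /* scratch half */
        printf("step %zu: copy %zu bytes from offset %zu into scratch+%zu\n",
               step, chunk, offset, slot);
    }
    return 0;
}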
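
The new resource_num variable in tl_cuda_team.c makes the sizing rule explicit: the team preallocates max_concurrent * 2 resource groups (sync structures, barriers, scratch slots), the first half for regular collectives and the second half for active-set bcast. The short sketch below just spells out that arithmetic; the constants are placeholders, not the library defaults.

/* Resource sizing sketch, assuming placeholder config values. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t max_concurrent = 8;        /* stands in for lib->cfg.max_concurrent */
    size_t   scratch_chunk  = 1 << 20;  /* stands in for lib->cfg.scratch_size   */
    uint32_t resource_num   = max_concurrent * 2;

    printf("barrier slots: %u (regular: 0..%u, active set: %u..%u)\n",
           resource_num, max_concurrent - 1, max_concurrent, resource_num - 1);
    printf("total scratch bytes: %zu\n", (size_t)resource_num * scratch_chunk);
    return 0;
}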