TL/CUDA: Linear Broadcast for GPU #948

Open

wants to merge 57 commits into base: master

Commits (57, all authored by ikryukov)
8365243  TL/CUDA: add linear bcast (Mar 22, 2024)
9813575  TL/CUDA: fix build (Mar 22, 2024)
cf5b239  TL/CUDA: wip (Jun 30, 2024)
e9fdd88  TL/CUDA: fix compilation (Jul 1, 2024)
4c93274  TL/CUDA: calc size (Jul 1, 2024)
3e5a47f  TL/CUDA: wip some logic for root (Jul 1, 2024)
1ca6ecf  TL/CUDA: wip logic for client (Jul 2, 2024)
6233440  TL/CUDA: added barrier to sync stages (Jul 3, 2024)
2ca25de  TL/CUDA: non zero root (Jul 3, 2024)
e8ab6dc  TL/CUDA: revert commented (Jul 3, 2024)
a5df3c9  TL/CUDA: wip multistep (Jul 3, 2024)
380f52c  TL/CUDA: fix step check (Jul 4, 2024)
e6b1ff6  TL/CUDA: minor cleanup (Jul 4, 2024)
ebf277c  TL/CUDA: removed breaks (Jul 5, 2024)
b1e9a5f  TL/CUDA: fix linter (Jul 6, 2024)
7ac6c39  TL/CUDA: double buffering (Jul 11, 2024)
38f0cd2  TL/CUDA: moved get/set rank step (Aug 2, 2024)
c193097  TL/CUDA: changed logs to debug lvl (Aug 2, 2024)
49049e8  TL/CUDA: minor cleanups (Aug 2, 2024)
5d4cab8  TL/CUDA: addressed comments (Aug 21, 2024)
b446271  TL/CUDA: removed done stage (Aug 23, 2024)
7c463f6  TL/CUDA: added unit test (Aug 23, 2024)
7e3538c  TL/CUDA: addressed comments (Sep 3, 2024)
198ec1c  TL/CUDA: fix formatting (Sep 3, 2024)
b49baf6  TL/CUDA: fixed compilation (Sep 3, 2024)
1bd78db  TL/CUDA: fix include (Oct 25, 2024)
b1b6c1e  TL/CUDA: removed returns (Oct 28, 2024)
d9c452b  TL/CUDA: active set support (Nov 6, 2024)
9eadde5  TL/CUDA: fix build (Nov 6, 2024)
5c5ae84  TL/CUDA: fixed comments (Nov 6, 2024)
d3a60be  TL/CUDA: select free bar using atomic (Nov 7, 2024)
a23b68a  TL/CUDA: fix (Nov 8, 2024)
ac83e63  TL/CUDA: replace free tag (Nov 8, 2024)
c083fa2  TL/CUDA: fix bar tag init val (Nov 8, 2024)
67edb77  TL/CUDA: added tag print (Nov 11, 2024)
3fc1c3e  TL/CUDA: changed tag to 64bits (Nov 12, 2024)
40c6ea8  TL/CUDA: fixed linter errors (Nov 12, 2024)
644012c  TL/CUDA: bar init logic in progress (Nov 18, 2024)
62f6ba7  TL/CUDA: fix CI build (Nov 18, 2024)
43d1900  TL/CUDA: removed unused var (Nov 18, 2024)
b914f53  TL/CUDA: refactor bar init (Nov 18, 2024)
53ca155  TL/CUDA: fix ci (Nov 18, 2024)
1658638  TL/CUDA: added bar stage (Nov 20, 2024)
da95927  TL/CUDA: revert bar stage (Nov 20, 2024)
1dd83ba  TL/CUDA: free bar in progress (Nov 21, 2024)
b29bcf7  TL/CUDA: completion barrier (Nov 22, 2024)
e16d9d4  TL/CUDA: removed prints (Nov 22, 2024)
7b0d87f  TL/CUDA: fix format (Nov 22, 2024)
da0c021  TL/CUDA: remove unused atomic (Nov 22, 2024)
b0da90f  TL/CUDA: removed unused include (Nov 22, 2024)
5c1e28b  TL/CUDA: fix clang compilation (Nov 22, 2024)
da19729  TL/CUDA: fixed comments,format,removed bool (Dec 5, 2024)
ee09b83  TL/CUDA: hide functions (Dec 9, 2024)
904c671  TL/CUDA: fix bug in non active set version (Dec 10, 2024)
3486f89  TL/CUDA: fixed build (Dec 10, 2024)
9f2e9d1  TL/CUDA: added assertions (Dec 12, 2024)
ccc60a5  TL/CUDA: addressed comments (Dec 30, 2024)
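The diffs below register a new broadcast collective in TL/CUDA and route it to a linear algorithm when the team topology is fully connected. From the application side the new path is reached through UCC's regular bcast API; the fragment below is only a hedged sketch of such a call (not part of this PR), assuming an already created ucc_context_h, ucc_team_h and CUDA device buffer, with most error handling elided. Field and enum names are taken from the public UCC API to the best of my knowledge.

#include <ucc/api/ucc.h>

/* Hypothetical helper (not from this PR): post a bcast of a CUDA buffer from
 * rank `root` and progress the context until the request completes. On a
 * single-node, fully connected NVLink team this is the kind of call that can
 * now be served by the TL/CUDA linear bcast added here. */
static ucc_status_t bcast_cuda_buffer(ucc_context_h ctx, ucc_team_h team,
                                      void *dev_buf, size_t count,
                                      uint64_t root)
{
    ucc_coll_args_t args = {
        .mask      = 0,
        .coll_type = UCC_COLL_TYPE_BCAST,
        .root      = root,
        .src.info  = {
            .buffer   = dev_buf,
            .count    = count,
            .datatype = UCC_DT_FLOAT32,
            .mem_type = UCC_MEMORY_TYPE_CUDA,
        },
    };
    ucc_coll_req_h req;
    ucc_status_t   st;

    st = ucc_collective_init(&args, &req, team);
    if (st != UCC_OK) {
        return st;
    }
    st = ucc_collective_post(req);
    while (st == UCC_OK && ucc_collective_test(req) == UCC_INPROGRESS) {
        ucc_context_progress(ctx);   /* drive the TL/CUDA progress engine */
    }
    ucc_collective_finalize(req);
    return st;
}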
8 changes: 7 additions & 1 deletion src/components/tl/cuda/Makefile.am
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
 #

@@ -27,6 +27,11 @@ alltoallv = \
     alltoallv/alltoallv.c \
     alltoallv/alltoallv_ce.c

+bcast = \
+    bcast/bcast.h \
+    bcast/bcast.c \
+    bcast/bcast_linear.c
+
 reduce_scatter = \
     reduce_scatter/reduce_scatter.h \
     reduce_scatter/reduce_scatter.c \

@@ -54,6 +59,7 @@ sources = \
     $(allgatherv) \
     $(alltoall) \
     $(alltoallv) \
+    $(bcast) \
     $(reduce_scatter) \
     $(reduce_scatterv)
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgather/allgather.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

@@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p);
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgather/allgather_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

@@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;

-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgatherv/allgatherv.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

@@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p);
20 changes: 2 additions & 18 deletions src/components/tl/cuda/allgatherv/allgatherv_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

@@ -55,22 +55,6 @@ enum
      * other ranks to finish */
 };

-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[step_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[step_id] = step;
-}
-
 ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task)
 {
     ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);

@@ -432,7 +416,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;

-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
28 changes: 28 additions & 0 deletions src/components/tl/cuda/bcast/bcast.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST] = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
+        return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+    } else {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+}
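bcast.c above only registers the algorithm table and dispatches to ucc_tl_cuda_bcast_linear_init() on fully connected topologies; the device staging logic itself lives in bcast/bcast_linear.c, which is not shown in this hunk. As a rough mental model only (hypothetical code, not taken from the PR), a linear broadcast pushes the root's buffer through a bounded shared scratch area in fixed-size steps, and the barriers and double buffering mentioned in the commit log keep the root from overwriting a chunk before every peer has drained it. The toy program below imitates that step structure with plain memcpy on the host.

#include <stdio.h>
#include <string.h>

#define CHUNK     4   /* bytes staged per step */
#define NUM_RANKS 4   /* rank 0 is the root    */

int main(void)
{
    const char src[] = "linear bcast, staged through scratch"; /* root's buffer  */
    char       dst[NUM_RANKS][sizeof(src)];                    /* peers' buffers */
    char       scratch[CHUNK];                                 /* shared staging */
    size_t     total = sizeof(src);

    memset(dst, 0, sizeof(dst));
    for (size_t off = 0; off < total; off += CHUNK) {     /* one step per chunk */
        size_t len = (total - off < CHUNK) ? total - off : CHUNK;

        memcpy(scratch, src + off, len);                   /* root fills scratch */
        for (int r = 1; r < NUM_RANKS; r++) {
            memcpy(dst[r] + off, scratch, len);            /* each peer drains it */
        }
        /* In the real algorithm a barrier (plus a second scratch buffer for
         * double buffering) lets the root safely start the next step. */
    }

    printf("rank %d received: \"%s\"\n", NUM_RANKS - 1, dst[NUM_RANKS - 1]);
    return 0;
}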
43 changes: 43 additions & 0 deletions src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t *tl_team,
+                                           ucc_coll_task_t **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_tl_cuda_bcast_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
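A small standalone sketch of how the ucc_tl_cuda_bcast_alg_from_str() helper declared above behaves: it returns the index of the matching algorithm name, or UCC_TL_CUDA_BCAST_ALG_LAST when the string matches nothing. The enum and name table are re-declared here (simplified to a plain string array instead of ucc_base_coll_alg_info_t) so the snippet compiles on its own; a real caller would just include bcast.h.

#include <stdio.h>
#include <strings.h>  /* strcasecmp */

enum {
    UCC_TL_CUDA_BCAST_ALG_LINEAR,
    UCC_TL_CUDA_BCAST_ALG_LAST
};

/* Simplified stand-in for the .name fields of ucc_tl_cuda_bcast_algs[] */
static const char *bcast_alg_names[UCC_TL_CUDA_BCAST_ALG_LAST] = {
    [UCC_TL_CUDA_BCAST_ALG_LINEAR] = "linear",
};

static int bcast_alg_from_str(const char *str)
{
    int i;
    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
        if (0 == strcasecmp(str, bcast_alg_names[i])) {
            break;  /* found: i is the algorithm id */
        }
    }
    return i;       /* ALG_LAST means "unknown algorithm" */
}

int main(void)
{
    printf("linear -> %d\n", bcast_alg_from_str("LINEAR")); /* 0, case-insensitive */
    printf("ring   -> %d\n", bcast_alg_from_str("ring"));   /* 1 == ..._ALG_LAST   */
    return 0;
}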