Skip to content

Commit

Permalink
TL/CUDA: add linear bcast
Browse files Browse the repository at this point in the history
  • Loading branch information
ikryukov committed Apr 12, 2024
1 parent 06959b9 commit a67cacd
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/components/tl/cuda/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ alltoallv = \
alltoallv/alltoallv.c \
alltoallv/alltoallv_ce.c

# Files of the bcast collective: its header, the algorithm table/dispatch
# (bcast.c) and the linear algorithm (bcast_linear.c).
bcast = \
bcast/bcast.h \
bcast/bcast.c \
bcast/bcast_linear.c

reduce_scatter = \
reduce_scatter/reduce_scatter.h \
reduce_scatter/reduce_scatter.c \
Expand Down
28 changes: 28 additions & 0 deletions src/components/tl/cuda/bcast/bcast.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "bcast.h"
#include "components/mc/ucc_mc.h"

/*
 * Table of the bcast algorithms provided by TL/CUDA, indexed by algorithm id.
 * The extra slot at UCC_TL_CUDA_BCAST_ALG_LAST holds an entry with id 0 and
 * NULL name/description.
 */
ucc_base_coll_alg_info_t
    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
        [UCC_TL_CUDA_BCAST_ALG_LINEAR] =
            {
                .id   = UCC_TL_CUDA_BCAST_ALG_LINEAR,
                .name = "linear",
                .desc = "linear bcast algorithm",
            },
        [UCC_TL_CUDA_BCAST_ALG_LAST] =
            {
                .id   = 0,
                .name = NULL,
                .desc = NULL,
            },
};

/**
 * Create a TL/CUDA bcast task.
 *
 * @param coll_args  collective arguments, forwarded to the algorithm init
 * @param tl_team    base team; must be a ucc_tl_cuda_team_t
 * @param task_p     receives the created task on success
 *
 * @return the result of ucc_tl_cuda_bcast_linear_init() when the team
 *         topology is fully connected, UCC_ERR_NOT_SUPPORTED otherwise.
 */
ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
                                    ucc_base_team_t      *tl_team,
                                    ucc_coll_task_t     **task_p)
{
    ucc_tl_cuda_team_t *cuda_team =
        ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

    /* The linear algorithm is the only one dispatched from here, and it is
     * only attempted on a fully connected topology. */
    if (!ucc_tl_cuda_team_topo_is_fully_conntected(cuda_team->topo)) {
        return UCC_ERR_NOT_SUPPORTED;
    }

    return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
}
43 changes: 43 additions & 0 deletions src/components/tl/cuda/bcast/bcast.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#ifndef BCAST_H_
#define BCAST_H_

#include "tl_cuda.h"
#include "tl_cuda_coll.h"

/* Identifiers of the bcast algorithms implemented by TL/CUDA. */
enum
{
    UCC_TL_CUDA_BCAST_ALG_LINEAR = 0, /* "linear" bcast algorithm */
    UCC_TL_CUDA_BCAST_ALG_LAST        /* number of algorithms; not an
                                       * algorithm itself */
};

/* Algorithm info table for bcast; one entry per algorithm id plus one extra
 * slot at index UCC_TL_CUDA_BCAST_ALG_LAST. */
extern ucc_base_coll_alg_info_t
ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];

/* Default algorithm selection string for bcast in TL/CUDA.
 * NOTE(review): presumably "@0" selects algorithm id 0, i.e.
 * UCC_TL_CUDA_BCAST_ALG_LINEAR - confirm against the selection-string
 * parser. */
#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"

/* Generic bcast task constructor: stores the new task in *task_p and
 * returns a UCC status code. */
ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *tl_team,
ucc_coll_task_t **task_p);

/* Task constructor for the linear bcast algorithm: stores the new task in
 * *task_p and returns a UCC status code. */
ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *tl_team,
ucc_coll_task_t **task_p);

/**
 * Map an algorithm name to its bcast algorithm id.
 *
 * The name is compared case-insensitively against the names stored in
 * ucc_tl_cuda_bcast_algs.
 *
 * @param str  algorithm name to look up
 *
 * @return index of the first matching entry, or UCC_TL_CUDA_BCAST_ALG_LAST
 *         when no entry matches.
 */
static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
{
    int alg_id = 0;

    /* Advance until a name matches or the table is exhausted. */
    while (alg_id < UCC_TL_CUDA_BCAST_ALG_LAST &&
           strcasecmp(str, ucc_tl_cuda_bcast_algs[alg_id].name) != 0) {
        alg_id++;
    }

    return alg_id;
}

#endif
86 changes: 86 additions & 0 deletions src/components/tl/cuda/bcast/bcast_linear.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "bcast/bcast.h"

/* Stages of a linear bcast task; the current one is kept in
 * task->bcast_linear.stage. */
enum
{
    STAGE_SYNC = 0, /*< Wait for free SYNC segment */
    STAGE_SETUP,    /*< Wait for memhandle setup to finish */
    STAGE_COPIES,   /*< Linear algorithm is running */
    STAGE_BARRIER   /*< Linear algorithm is done, waiting for
                     *  other ranks to finish */
};

/**
 * Finalize callback of the linear bcast task.
 *
 * Emits a trace message and hands the task to ucc_tl_cuda_task_put().
 *
 * @param coll_task  base task; must be embedded in a ucc_tl_cuda_task_t
 *
 * @return always UCC_OK.
 */
ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
{
    ucc_tl_cuda_task_t *cuda_task =
        ucc_derived_of(coll_task, ucc_tl_cuda_task_t);

    tl_trace(UCC_TASK_LIB(cuda_task), "finalizing task %p", cuda_task);
    ucc_tl_cuda_task_put(cuda_task);

    return UCC_OK;
}

/**
 * Progress callback of the linear bcast task.
 *
 * The stage machine (STAGE_SYNC .. STAGE_BARRIER) is not implemented here:
 * every call only sets the task status to UCC_INPROGRESS, so a task driven
 * by this callback never reaches a final status.
 *
 * @param coll_task  base task; must be embedded in a ucc_tl_cuda_task_t
 */
void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
{
    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);

    /* TODO: advance task->bcast_linear.stage and set a final status once
     * the copies and the closing barrier are done. */
    task->super.status = UCC_INPROGRESS;
}

/**
 * Post callback of the linear bcast task.
 *
 * Resets the task to STAGE_SYNC, records the source buffer taken from the
 * collective arguments and enqueues the task on the core context progress
 * queue.
 *
 * @param coll_task  base task; must be embedded in a ucc_tl_cuda_task_t
 *
 * @return the status returned by ucc_progress_queue_enqueue().
 */
ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
{
    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
    ucc_coll_args_t    *args = &TASK_ARGS(task);

    task->bcast_linear.stage = STAGE_SYNC;

    /* NOTE(review): this stores the source buffer in the allgatherv_linear
     * state of the task, not in bcast_linear (which only has 'stage').
     * Whether the two share storage is not visible from here - confirm, and
     * give bcast_linear its own buffer field when the algorithm is
     * implemented. */
    task->allgatherv_linear.sbuf = args->src.info.buffer;

    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
}

/**
 * Create a task for the linear bcast algorithm.
 *
 * @param coll_args  collective arguments used to initialize the task
 * @param tl_team    base team; must be a ucc_tl_cuda_team_t
 * @param task_p     receives the created task on success
 *
 * @return UCC_OK on success;
 *         UCC_ERR_NOT_SUPPORTED when the team topology is not fully
 *         connected or the number of peers (team size - 1) exceeds
 *         UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS;
 *         otherwise the error returned by ucc_tl_cuda_task_init().
 */
ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
                                           ucc_base_team_t      *tl_team,
                                           ucc_coll_task_t     **task_p)
{
    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
    ucc_tl_cuda_task_t *task;
    ucc_status_t        status;

    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
        UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
        return UCC_ERR_NOT_SUPPORTED;
    }

    status = ucc_tl_cuda_task_init(coll_args, team, &task);
    if (ucc_unlikely(status != UCC_OK)) {
        return status;
    }

    task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
    /* Use the bcast callbacks defined in this file. The allgatherv_linear
     * callbacks that were wired here before are not declared via bcast.h and
     * operate on allgatherv state that this function never sets up. */
    task->super.post     = ucc_tl_cuda_bcast_linear_start;
    task->super.progress = ucc_tl_cuda_bcast_linear_progress;
    task->super.finalize = ucc_tl_cuda_bcast_linear_finalize;
    task->bar            = TASK_BAR(task);

    *task_p = &task->super;
    return UCC_OK;
}

3 changes: 3 additions & 0 deletions src/components/tl/cuda/tl_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "components/mc/base/ucc_mc_base.h"
#include "allgather/allgather.h"
#include "allgatherv/allgatherv.h"
#include "bcast/bcast.h"
#include "reduce_scatter/reduce_scatter.h"
#include "reduce_scatterv/reduce_scatterv.h"

Expand Down Expand Up @@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void)
ucc_tl_cuda_allgather_algs;
ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
ucc_tl_cuda_allgatherv_algs;
ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
ucc_tl_cuda_bcast_algs;
ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] =
ucc_tl_cuda_reduce_scatter_algs;
ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] =
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/tl_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#define UCC_TL_CUDA_SUPPORTED_COLLS \
(UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV | \
UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV | \
UCC_COLL_TYPE_BCAST | \
UCC_COLL_TYPE_REDUCE_SCATTER | UCC_COLL_TYPE_REDUCE_SCATTERV)

#define UCC_TL_CUDA_TEAM_LIB(_team) \
Expand Down Expand Up @@ -224,6 +225,10 @@ struct ucc_tl_cuda_task {
size_t (*get_offset)(const ucc_tl_cuda_task_t *task,
ucc_rank_t block);
} allgatherv_linear;

struct {
int stage;
} bcast_linear;
struct {
int stage;
int num_frags;
Expand Down
16 changes: 16 additions & 0 deletions src/components/tl/cuda/tl_cuda_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "alltoallv/alltoallv.h"
#include "allgather/allgather.h"
#include "allgatherv/allgatherv.h"
#include "bcast/bcast.h"
#include "reduce_scatter/reduce_scatter.h"
#include "reduce_scatterv/reduce_scatterv.h"
#include "utils/arch/cpu.h"
Expand All @@ -35,6 +36,7 @@ const char *
ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR,
UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR,
UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR,
UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR};

Expand Down Expand Up @@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
return ucc_tl_cuda_allgather_init(coll_args, team, task_h);
case UCC_COLL_TYPE_ALLGATHERV:
return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h);
case UCC_COLL_TYPE_BCAST:
return ucc_tl_cuda_bcast_init(coll_args, team, task_h);
case UCC_COLL_TYPE_REDUCE_SCATTER:
return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h);
case UCC_COLL_TYPE_REDUCE_SCATTERV:
Expand Down Expand Up @@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
return ucc_tl_cuda_allgather_alg_from_str(str);
case UCC_COLL_TYPE_ALLGATHERV:
return ucc_tl_cuda_allgatherv_alg_from_str(str);
case UCC_COLL_TYPE_BCAST:
return ucc_tl_cuda_bcast_alg_from_str(str);
default:
break;
}
Expand Down Expand Up @@ -187,6 +193,16 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
break;
};
break;
case UCC_COLL_TYPE_BCAST:
switch (alg_id) {
case UCC_TL_CUDA_BCAST_ALG_LINEAR:
*init = ucc_tl_cuda_bcast_linear_init;
break;
default:
status = UCC_ERR_INVALID_PARAM;
break;
};
break;
case UCC_COLL_TYPE_REDUCE_SCATTER:
switch (alg_id) {
case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO:
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/cuda/tl_cuda_coll.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "tl_cuda.h"
#include "components/mc/ucc_mc.h"

#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5
extern const char
*ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];

Expand Down

0 comments on commit a67cacd

Please sign in to comment.