Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/components/tl/cuda/allgather/allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
Expand Down
6 changes: 6 additions & 0 deletions src/components/tl/cuda/allgatherv/allgatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t ** task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/reduce_scatter/reduce_scatter.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team,
task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team,
task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/tl_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ static ucs_config_field_t ucc_tl_cuda_context_config_table[] = {
{"", "", NULL, ucc_offsetof(ucc_tl_cuda_context_config_t, super),
UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)},

{"LAZY_INIT", "yes",
"Initialize team on first collective",
ucc_offsetof(ucc_tl_cuda_context_config_t, lazy_init),
UCC_CONFIG_TYPE_BOOL},

{NULL}};

ucc_status_t ucc_tl_cuda_get_context_attr(const ucc_base_context_t *context,
Expand Down
47 changes: 33 additions & 14 deletions src/components/tl/cuda/tl_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@
(ucc_tl_cuda_shm_barrier_t *)_bar; \
})

/*
 * GET_RANK_ID - locate the rank-id record of one team rank in a packed array.
 *
 * _ids            base address of the packed records
 * _trank          index of the record to fetch
 * _max_concurrent number of cudaIpcEventHandle_t slots carried by each record
 *
 * Records are variable-sized: a ucc_tl_cuda_rank_id_t (which already holds one
 * event handle) followed by (_max_concurrent - 1) further handles. Plain
 * _ids[_trank] indexing would therefore use the wrong stride; the offset is
 * computed in bytes instead.
 *
 * Every argument is parenthesized so that expression arguments such as
 * `i + 1` or `a ? b : c` expand with the intended precedence.
 *
 * Evaluates to a (ucc_tl_cuda_rank_id_t *). Uses a GNU statement expression.
 * No bounds checking is done on _trank.
 */
#define GET_RANK_ID(_ids, _trank, _max_concurrent)                             \
    ({                                                                         \
        size_t _rank_id_size = sizeof(ucc_tl_cuda_rank_id_t) +                 \
            ((_max_concurrent) - 1) * sizeof(cudaIpcEventHandle_t);            \
        void *_rank_id = PTR_OFFSET((_ids), (_trank) * _rank_id_size);         \
        (ucc_tl_cuda_rank_id_t*)_rank_id;                                      \
    })

#ifdef HAVE_PROFILING_TL_CUDA
#include "utils/profile/ucc_profile.h"
#else
Expand Down Expand Up @@ -81,6 +89,7 @@ typedef struct ucc_tl_cuda_lib_config {

/* Context-level configuration of the CUDA TL. */
typedef struct ucc_tl_cuda_context_config {
/* Generic TL context configuration this structure extends. */
ucc_tl_context_config_t super;
/* Boolean value of the LAZY_INIT option ("Initialize team on first
 * collective"); the option's default is "yes". */
int lazy_init;
} ucc_tl_cuda_context_config_t;

typedef struct ucc_tl_cuda_lib {
Expand All @@ -93,8 +102,6 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_lib_t, const ucc_base_lib_params_t *,
typedef struct ucc_tl_cuda_context {
ucc_tl_context_t super;
ucc_tl_cuda_context_config_t cfg;
int device;
ucc_tl_cuda_device_pci_id_t device_id;
ucc_tl_cuda_topo_t *topo;
ucc_mpool_t req_mp;
tl_cuda_ep_hash_t *ipc_cache;
Expand Down Expand Up @@ -127,14 +134,14 @@ typedef struct ucc_tl_cuda_rank_id {
ucc_tl_cuda_device_pci_id_t pci_id;
ucc_tl_cuda_mem_info_t scratch_info;
int shm;
cudaIpcEventHandle_t ev_handle[1]; /* first of max concurrent handles; the rest follow the struct (see GET_RANK_ID) */
} ucc_tl_cuda_rank_id_t;

typedef struct ucc_tl_cuda_sync {
int seq_num[UCC_TL_CUDA_MAX_RING_CHUNKS];
ucc_tl_cuda_mem_info_t mem_info_src;
ucc_tl_cuda_mem_info_t mem_info_dst;
cudaEvent_t ipc_event_local;
cudaIpcEventHandle_t ev_handle;
union {
struct {
size_t sbytes[UCC_TL_CUDA_MAX_PEERS];
Expand All @@ -152,20 +159,32 @@ typedef struct ucc_tl_cuda_scratch {
ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS];
} ucc_tl_cuda_scratch_t;

/*
 * Team state values.
 * NOTE(review): presumably the values held by ucc_tl_cuda_team_t::state while
 * a team moves through creation and communicator setup. The code performing
 * the transitions is not visible here -- confirm the meaning and order of each
 * state against the team implementation.
 */
enum {
TL_CUDA_STATE_READY,
TL_CUDA_STATE_SHM_ID_EXCHANGE,
TL_CUDA_STATE_COMM_INIT,
TL_CUDA_STATE_ERROR
};

/*
 * CUDA TL team.
 *
 * Each member is declared exactly once: the former listing repeated every
 * pre-existing member, which is a redeclaration error in C.
 */
typedef struct ucc_tl_cuda_team {
    ucc_tl_team_t              super;      /* base TL team object */
    /* NOTE(review): presumably one of the TL_CUDA_STATE_* values -- confirm */
    int                        state;
    uint32_t                   seq_num;
    /* CUDA device of the team; UCC_TL_CUDA_CHECK_DEVICE_MATCH skips its
     * comparison while this is -1 */
    int                        device;
    /* NOTE(review): presumably the PCI id of `device` -- confirm where set */
    ucc_tl_cuda_device_pci_id_t device_id;
    ucc_tl_cuda_team_topo_t   *topo;       /* queried when picking an algorithm */
    ucc_tl_cuda_sync_t        *sync;
    ucc_tl_cuda_sync_state_t  *sync_state;
    ucc_tl_cuda_shm_barrier_t *bar;
    ucc_tl_cuda_scratch_t      scratch;
    cudaStream_t               stream;
    /* NOTE(review): records look variable-sized (see GET_RANK_ID); index
     * through that macro rather than ids[i] -- confirm */
    ucc_tl_cuda_rank_id_t     *ids;
    ucc_team_oob_coll_t        oob;
    void                      *oob_req;
} ucc_tl_cuda_team_t;

/*
 * Communicator setup for a CUDA TL team.
 *
 * The collective init entry points call this before doing anything else and
 * propagate its return value unchanged whenever it is not UCC_OK.
 *
 * NOTE(review): presumably this performs the team initialization deferred by
 * the LAZY_INIT option and is cheap when the team is already set up -- the
 * definition is not visible here; confirm before relying on either.
 */
ucc_status_t ucc_tl_cuda_comm_init(ucc_tl_cuda_team_t *team);

UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);

Expand Down
20 changes: 1 addition & 19 deletions src/components/tl/cuda/tl_cuda_coll.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -12,24 +12,6 @@
#include "reduce_scatter/reduce_scatter.h"
#include "reduce_scatterv/reduce_scatterv.h"
#include "utils/arch/cpu.h"
#include "utils/arch/cuda_def.h"


/*
 * UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) - debug-only device sanity check.
 *
 * When ENABLE_DEBUG == 1: reads the current CUDA device with cudaGetDevice()
 * and compares it with the device stored in the team's TL context. On a
 * mismatch it logs an error and executes `return UCC_ERR_INVALID_PARAM;` in
 * the enclosing function, so it may only be used inside functions returning
 * ucc_status_t.
 * Otherwise the macro expands to nothing.
 *
 * NOTE(review): CUDA_CHECK presumably also returns from the enclosing
 * function when cudaGetDevice() fails -- confirm in utils/arch/cuda_def.h.
 */
#if ENABLE_DEBUG == 1
/* TODO: possible need to check CUDA context */
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) do { \
int _dev; \
CUDA_CHECK(cudaGetDevice(&_dev)); \
if (_dev != UCC_TL_CUDA_TEAM_CTX(_team)->device) { \
tl_error(UCC_TL_TEAM_LIB(_team), "CUDA device mismatch, " \
"current device %d, team device %d\n", _dev, \
UCC_TL_CUDA_TEAM_CTX(_team)->device); \
return UCC_ERR_INVALID_PARAM; \
} \
} while(0)
#else
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
#endif

const char *
ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
Expand Down
24 changes: 23 additions & 1 deletion src/components/tl/cuda/tl_cuda_coll.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -9,11 +9,28 @@

#include "tl_cuda.h"
#include "components/mc/ucc_mc.h"
#include "utils/arch/cuda_def.h"

#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
extern const char
*ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];

/*
 * UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) - debug-only device sanity check.
 *
 * Debug builds (ENABLE_DEBUG == 1): fetch the current CUDA device and, unless
 * the team's device is still -1, require the two to be equal. A mismatch is
 * logged and makes the *enclosing function* return UCC_ERR_INVALID_PARAM, so
 * the macro is only usable in functions returning ucc_status_t.
 * Other builds: expands to nothing.
 *
 * NOTE(review): CUDA_CHECK presumably returns from the enclosing function as
 * well when cudaGetDevice() fails -- confirm in utils/arch/cuda_def.h.
 */
#if ENABLE_DEBUG == 1
/* TODO: possible need to check CUDA context */
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)                                  \
    do {                                                                       \
        int _cur_dev;                                                          \
        CUDA_CHECK(cudaGetDevice(&_cur_dev));                                  \
        if (((_team)->device != -1) && (_cur_dev != (_team)->device)) {        \
            tl_error(UCC_TL_TEAM_LIB(_team),                                   \
                     "CUDA device mismatch, current device %d, team device %d\n", \
                     _cur_dev, (_team)->device);                               \
            return UCC_ERR_INVALID_PARAM;                                      \
        }                                                                      \
    } while(0)
#else
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
#endif

#define TASK_TEAM(_task) \
(ucc_derived_of((_task)->super.team, ucc_tl_cuda_team_t))

Expand Down Expand Up @@ -85,6 +102,11 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
UCC_TL_CUDA_CHECK_DEVICE_MATCH(team);
if (!ucc_coll_args_is_predefined_dt(&coll_args->args, trank)) {
return UCC_ERR_NOT_SUPPORTED;
}
Expand Down
15 changes: 0 additions & 15 deletions src/components/tl/cuda/tl_cuda_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
ucc_status_t status;
int num_devices;
cudaError_t cuda_st;
CUcontext cu_ctx;
CUresult cu_st;

UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super,
params->context);
Expand All @@ -37,13 +35,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
return UCC_ERR_NO_RESOURCE;
}

cu_st = cuCtxGetCurrent(&cu_ctx);
if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) {
tl_debug(self->super.super.lib,
"cannot create CUDA TL context without active CUDA context");
return UCC_ERR_NO_RESOURCE;
}

status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0,
UCC_CACHE_LINE_SIZE, 8, UINT_MAX,
&ucc_coll_task_mpool_ops, params->thread_mode,
Expand All @@ -54,18 +45,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
return status;
}

CUDA_CHECK_GOTO(cudaGetDevice(&self->device), free_mpool, status);
status = ucc_tl_cuda_topo_create(self->super.super.lib, &self->topo);
if (status != UCC_OK) {
tl_error(self->super.super.lib,
"failed to initialize tl_cuda_topo");
goto free_mpool;
}
status = ucc_tl_cuda_topo_get_pci_id(self->device, &self->device_id);
if (status != UCC_OK) {
tl_error(self->super.super.lib, "failed to get pci id");
goto free_mpool;
}

self->ipc_cache = kh_init(tl_cuda_ep_hash);
tl_debug(self->super.super.lib, "initialized tl context: %p", self);
Expand Down
Loading