Skip to content

Commit 59f43f0

Browse files
committed
enh: Select platform optimizations at runtime
Replace compile-time AWS platform detection with runtime EFA device detection using hwloc. This allows a single binary to work on both AWS and non-AWS environments, automatically enabling optimizations when EFA hardware is present. Removes autotools platform checks and always builds AWS platform code. Signed-off-by: Hershel Shah <[email protected]>
1 parent 3ac9392 commit 59f43f0

15 files changed

+142
-105
lines changed

configure.ac

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,6 @@ CHECK_PKG_HWLOC([],
137137
CHECK_PKG_VALGRIND()
138138
CHECK_VAR_REDZONE()
139139

140-
NCCL_OFI_PLATFORM="none"
141-
AS_IF([test "${NCCL_OFI_PLATFORM}" = "none"], [AX_CHECK_PLATFORM_AWS()])
142-
143140
AS_IF([test "${valgrind_enabled}" = "1" -a "${enable_asan}" = "yes"],
144141
[AC_MSG_ERROR([Enabling ASAN and valgrind at the same time is not permitted])])
145142

@@ -278,5 +275,5 @@ AC_OUTPUT
278275
echo "*"
279276
echo "* AWS OFI NCCL plugin has been configured."
280277
echo "*"
281-
echo "* Platform-specific optimizations: ${NCCL_OFI_PLATFORM}"
278+
echo "* Platform optimizations: AWS, Default"
282279
echo "*"

include/nccl_ofi_param.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,4 +374,10 @@ OFI_NCCL_PARAM_UINT(cm_num_rx_buffers, "CM_NUM_RX_BUFFERS", 32);
374374
OFI_NCCL_PARAM_VALUE_SET(PROGRESS_MODEL, (UNSPEC)(AUTO)(MANUAL))
375375
OFI_NCCL_PARAM(PROGRESS_MODEL, progress_model, "PROGRESS_MODEL", PROGRESS_MODEL::UNSPEC)
376376

377+
/*
378+
* Override platform selection. Valid options: AWS, DEFAULT, or AUTO for auto-detection.
379+
*/
380+
OFI_NCCL_PARAM_VALUE_SET(PLATFORM, (AUTO)(AWS)(DEFAULT))
381+
OFI_NCCL_PARAM(PLATFORM, platform, "PLATFORM", PLATFORM::AUTO);
382+
377383
#endif // End NCCL_OFI_PARAM_H_

include/nccl_ofi_platform.h

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -135,24 +135,21 @@ class PlatformManager {
135135
*/
136136
static PlatformManager& get_global();
137137

138-
/**
139-
* @brief Register a platform with the manager
140-
*
141-
* Platforms are automatically sorted by priority in the internal map.
142-
* Higher priority values take precedence and duplicates are dropped.
143-
*
144-
* @param platform Platform instance to register (moved)
145-
*/
146-
void register_platform(PlatformPtr&& platform);
147-
148138
/**
149139
* @brief Get the highest priority platform instance
150140
*
151141
* Returns the platform with the highest priority value.
152142
*
153143
* @return Reference to highest priority platform
154144
*/
155-
inline Platform& get_platform() { return *platforms_.rbegin()->second; }
145+
inline Platform& get_platform() {
146+
// std::map sorts by priority key; rbegin() gives highest priority
147+
auto it = platforms_.rbegin();
148+
Platform& selected = *it->second;
149+
NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Selected platform: %s with priority %d",
150+
selected.get_name(), it->first);
151+
return selected;
152+
}
156153

157154
/**
158155
* @brief Get number of registered platforms (for testing)
@@ -167,9 +164,21 @@ class PlatformManager {
167164
* instance is meant to be used in the plugin and the unit
168165
* tests leverage the protected scope.
169166
*/
170-
PlatformManager() {
171-
register_platform(std::make_unique<Default>());
172-
}
167+
PlatformManager();
168+
169+
/**
170+
* @brief Register a platform with the manager
171+
*
172+
* Platforms are automatically sorted by priority in the internal map.
173+
* Higher priority values take precedence and duplicates are dropped.
174+
* This can only be done in the constructor, as all platforms must be
175+
* added at the creation of this object to allow the tuner and the plugin
176+
* to operate consistently.
177+
*
178+
* @param platform Platform instance to register (moved)
179+
*/
180+
void register_platform(PlatformPtr&& platform);
181+
173182

174183
private:
175184
std::map<int, PlatformPtr> platforms_;

include/nccl_ofi_topo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,4 +310,12 @@ struct fi_info *nccl_ofi_topo_next_info_list(nccl_ofi_topo_data_iterator_t *iter
310310
*/
311311
int nccl_ofi_topo_write_nccl_topology(nccl_ofi_topo_t *topo, FILE *file);
312312

313+
/*
314+
* @brief Check if EFA or ENA devices are present in the system
315+
*
316+
* @return true, if EFA or ENA devices are detected
317+
* false, otherwise
318+
*/
319+
bool nccl_ofi_topo_has_efa_ena_devices();
320+
313321
#endif // End NCCL_NET_OFI_TOPO_H_

include/platform-aws.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
class PlatformAWS : public Platform {
2424
public:
2525
const char* get_name() const override { return "AWS"; }
26-
int get_priority() override { return 100; }
26+
int get_priority() override { return PLATFORM_PRIORITY * (is_ec2_instance() ? 1 : -1); }
2727
int init(const char **provider_filter) override;
2828
int config_endpoint(struct fi_info *info, struct fid_ep *ep) override;
2929
void sort_rails(struct fi_info **info_list, size_t num_rails, size_t num_groups) override;
@@ -72,7 +72,13 @@ class PlatformAWS : public Platform {
7272
return fields ? fields->func_idx : -EIO;
7373
}
7474

75+
// Determine if running on Amazon EC2 instance
76+
bool is_ec2_instance();
77+
7578
private:
79+
// Constants
80+
static constexpr int PLATFORM_PRIORITY = 100;
81+
7682
std::mutex mutex_;
7783

7884
// Cache for GUID fields to avoid repeated sysfs reads

m4/ax_platform_aws.m4

Lines changed: 0 additions & 38 deletions
This file was deleted.

src/Makefile.am

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,9 @@ sources = \
3434
nccl_ofi_ep_addr_list.cpp \
3535
nccl_ofi_param.cpp \
3636
nccl_ofi_platform.cpp \
37+
platform-aws.cpp \
3738
tracepoint.cpp
3839

39-
if WANT_PLATFORM_AWS
40-
sources += platform-aws.cpp
41-
endif
42-
4340
if ENABLE_NEURON
4441
sources += nccl_ofi_interface_neuron.cpp
4542
else
@@ -48,13 +45,11 @@ else
4845
nccl_ofi_interface_nvidia.cpp
4946

5047
# add the tuner sources into the library
51-
if WANT_PLATFORM_AWS
52-
sources += \
48+
sources += \
5349
tuner/nccl_ofi_regions.cpp \
5450
tuner/nccl_ofi_tuner.cpp \
5551
tuner/nccl_ofi_model.cpp
5652
endif
57-
endif
5853

5954
# Build an internal-only library that can be used by unit tests as
6055
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
@@ -104,20 +99,19 @@ if ENABLE_NCCL_NET_LIBRARY
10499
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
105100
libnccl_net_la_LDFLAGS = -module -avoid-version
106101
endif
107-
if WANT_PLATFORM_AWS
102+
108103
# NCCL standardized on the libnccl-tuner-<interface> format after we released a
109104
# plugin with the tuner named libnccl-ofi-tuner.so. Create separate libraries
110105
# for each name.
111-
lib_LTLIBRARIES += libnccl-ofi-tuner.la libnccl-tuner-ofi.la
112-
libnccl_ofi_tuner_la_SOURCES =
113-
libnccl_ofi_tuner_la_LIBADD = libinternal_plugin.la
114-
libnccl_ofi_tuner_la_LIBTOOLFLAGS = --tag=CXX
115-
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
106+
lib_LTLIBRARIES += libnccl-ofi-tuner.la libnccl-tuner-ofi.la
107+
libnccl_ofi_tuner_la_SOURCES =
108+
libnccl_ofi_tuner_la_LIBADD = libinternal_plugin.la
109+
libnccl_ofi_tuner_la_LIBTOOLFLAGS = --tag=CXX
110+
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
116111

117-
libnccl_tuner_ofi_la_SOURCES =
118-
libnccl_tuner_ofi_la_LIBADD = libinternal_plugin.la
119-
libnccl_tuner_ofi_la_LIBTOOLFLAGS = --tag=CXX
120-
libnccl_tuner_ofi_la_LDFLAGS = -module -avoid-version
121-
endif
112+
libnccl_tuner_ofi_la_SOURCES =
113+
libnccl_tuner_ofi_la_LIBADD = libinternal_plugin.la
114+
libnccl_tuner_ofi_la_LIBTOOLFLAGS = --tag=CXX
115+
libnccl_tuner_ofi_la_LDFLAGS = -module -avoid-version
122116

123117
endif

src/nccl_ofi_net.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@
3030
#include "nccl_ofi_idpool.h"
3131
#include "nccl_ofi_dmabuf.h"
3232
#include "nccl_ofi_platform.h"
33-
#ifdef WANT_AWS_PLATFORM
34-
#include "platform-aws.h"
35-
#endif
3633
#include "nccl_ofi_ofiutils.h"
3734
#include "nccl_ofi_system.h"
3835

@@ -178,10 +175,6 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
178175
nic_dup_conns = ofi_nccl_nic_dup_conns();
179176
cq_read_count = ofi_nccl_cq_read_count();
180177

181-
#ifdef WANT_AWS_PLATFORM
182-
PlatformManager::get_global().register_platform(std::make_unique<PlatformAWS>());
183-
#endif
184-
185178
ret = PlatformManager::get_global().get_platform().init(&provider_filter);
186179
if (ret != 0)
187180
goto exit;

src/nccl_ofi_platform.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
*/
44

55
#include "nccl_ofi_platform.h"
6+
#include "platform-aws.h"
7+
8+
PlatformManager::PlatformManager() {
9+
register_platform(std::make_unique<Default>());
10+
register_platform(std::make_unique<PlatformAWS>());
11+
}
612

713
PlatformManager& PlatformManager::get_global() {
814
static PlatformManager manager;
@@ -21,6 +27,6 @@ void PlatformManager::register_platform(PlatformPtr&& platform) {
2127
// TODO: Add proper resolution mechanism for competing priorities
2228
priority++;
2329
}
24-
30+
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Adding %s platform with %d priority", name, priority);
2531
platforms_[priority] = std::move(platform);
2632
}

src/nccl_ofi_topo.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1722,3 +1722,29 @@ struct fi_info *nccl_ofi_topo_next_info_list(nccl_ofi_topo_data_iterator_t *iter
17221722

17231723
return info_list;
17241724
}
1725+
1726+
bool nccl_ofi_topo_has_efa_ena_devices()
1727+
{
1728+
hwloc_topology_t topo;
1729+
if (hwloc_topology_init(&topo) != 0) {
1730+
return false;
1731+
}
1732+
1733+
auto topology = std::shared_ptr<hwloc_topology>(topo, hwloc_topology_destroy);
1734+
enable_hwloc_io_types(topo);
1735+
1736+
if (hwloc_topology_load(topo) != 0) {
1737+
return false;
1738+
}
1739+
1740+
hwloc_obj_t obj = nullptr;
1741+
while ((obj = hwloc_get_next_pcidev(topo, obj)) != nullptr) {
1742+
// Check for Amazon vendor id and EFA device or ENA device
1743+
if (obj->attr->pcidev.vendor_id == 0x1D0F &&
1744+
(((obj->attr->pcidev.device_id & 0xFFF0) == 0xEFA0 || (obj->attr->pcidev.device_id & 0xFFF0) == 0xEC20) ||
1745+
(obj->attr->pcidev.device_id & 0x0FFF) == 0x0EC2)) {
1746+
return true;
1747+
}
1748+
}
1749+
return false;
1750+
}

0 commit comments

Comments
 (0)