Skip to content

Commit d47c0b7

Browse files
committed
DCGM 4.1.1 (#213)
Signed-off-by: Nik Konyuchenko <[email protected]>
1 parent 37b325d commit d47c0b7

19 files changed

+361
-79
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
cmake_minimum_required(VERSION 3.21.0)
1616

17-
project(datacenter-gpu-manager-4 VERSION 4.1.0)
17+
project(datacenter-gpu-manager-4 VERSION 4.1.1)
1818

1919
set(CMAKE_BUILD_WITH_INSTALL_RPATH OFF CACHE BOOL
2020
"Specify whether to link the target in the build tree with the INSTALL_RPATH")

cmake/cpack-deb-prebuild.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#[[
2-
Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights
2+
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
33
reserved.
44
55
Licensed under the Apache License, Version 2.0 (the "License");

cmake/cpack-rpm-prebuild.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#[[
2-
Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights
2+
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
33
reserved.
44
55
Licensed under the Apache License, Version 2.0 (the "License");

dcgm_config/CMakeLists.txt.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ project(datacenter-gpu-manager-4-config
2525
set(DCGM_ROOT_DIR "@CMAKE_SOURCE_DIR@")
2626

2727
set(DCGM_NVVS_SRC_DIR "${DCGM_ROOT_DIR}/nvvs")
28-
set(DCGM_NVVS_INSTALL_DIR "@CMAKE_INSTALL_LIBEXECDIR@/@PROJECT_NAME@")
28+
set(DCGM_NVVS_INSTALL_DIR "@CMAKE_INSTALL_DATAROOTDIR@/@PROJECT_NAME@")
2929

3030
configure_file("${DCGM_NVVS_SRC_DIR}/diag-skus.yaml.in" diag-skus.yaml @ONLY)
3131

dcgmlib/src/DcgmCacheManager.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3076,7 +3076,7 @@ dcgmReturn_t DcgmCacheManager::UpdateWatchFromWatchers(dcgmcm_watch_info_p watch
30763076
for (++it; it != watchInfo->watchers.end(); ++it)
30773077
{
30783078
minMonitorFreqUsec = std::min(minMonitorFreqUsec, it->monitorIntervalUsec);
3079-
minMaxAgeUsec = std::min(minMaxAgeUsec, it->maxAgeUsec);
3079+
minMaxAgeUsec = std::max(minMaxAgeUsec, it->maxAgeUsec);
30803080
if (it->isSubscribed)
30813081
hasSubscribedWatchers = 1;
30823082
}
@@ -3085,7 +3085,10 @@ dcgmReturn_t DcgmCacheManager::UpdateWatchFromWatchers(dcgmcm_watch_info_p watch
30853085
watchInfo->maxAgeUsec = minMaxAgeUsec;
30863086
watchInfo->hasSubscribedWatchers = hasSubscribedWatchers;
30873087

3088-
log_debug("UpdateWatchFromWatchers minMonitorFreqUsec {}, minMaxAgeUsec {}, hsw {}",
3088+
log_debug("UpdateWatchFromWatchers gid {}, eid {}, fid {}, minMonitorFreqUsec {}, minMaxAgeUsec {}, hsw {}",
3089+
watchInfo->watchKey.entityGroupId,
3090+
watchInfo->watchKey.entityId,
3091+
watchInfo->watchKey.fieldId,
30893092
(long long)minMonitorFreqUsec,
30903093
(long long)minMaxAgeUsec,
30913094
watchInfo->hasSubscribedWatchers);

dcgmlib/src/DcgmCacheManager.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,7 +1702,10 @@ class DcgmCacheManager : public DcgmTaskRunner
17021702
timelib64_t now,
17031703
timelib64_t expireTime);
17041704

1705+
#ifndef TEST_DCGMCACHEMANAGER
17051706
private:
1707+
#endif
1708+
17061709
int m_pollInLockStep; /* Whether to poll when told to (1) or at the
17071710
frequency of the most frequent stat being tracked (0) */
17081711

intodocker.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616

1717
set -o errexit -o pipefail -o noclobber -o nounset
1818

19-
if [[ ${DEBUG_BUILD_SCRIPT:-0} -eq 1 ]]; then
19+
if [[ ${DEBUG_BUILD_SCRIPT:-0} -eq 1 ]]
20+
then
2021
PS4='$LINENO: ' # to see line numbers
2122
set -xv
2223
fi

modules/nvswitch/FieldDefinitions.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ class FieldIdControlType<DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS> : public FieldIdCont
232232

233233
UpdateFuncType UpdateFunc(void) const override
234234
{
235-
return UpdateNvSwitchVectorFieldType<fieldId>::updateFunc;
235+
return UpdateNvSwitchScalarFieldType<fieldId>::updateFunc;
236236
}
237237
};
238238

@@ -254,7 +254,7 @@ class FieldIdControlType<DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS> : public FieldId
254254

255255
UpdateFuncType UpdateFunc(void) const override
256256
{
257-
return UpdateNvSwitchVectorFieldType<fieldId>::updateFunc;
257+
return UpdateNvSwitchScalarFieldType<fieldId>::updateFunc;
258258
}
259259
};
260260

modules/sysmon/DcgmModuleSysmon.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,36 +1096,42 @@ void DcgmModuleSysmon::PopulateTemperatureFileMap()
10961096
}
10971097
};
10981098

1099-
std::unique_ptr<DIR, decltype(dirDeleter)> dir(opendir(THERMAL_BASE_PATH.c_str()), dirDeleter);
1100-
struct dirent *entry {};
1099+
auto dir = std::unique_ptr<DIR, decltype(dirDeleter)>(opendir(THERMAL_BASE_PATH.c_str()), dirDeleter);
1100+
if (!dir)
1101+
{
1102+
auto syserr = std::system_error(errno, std::generic_category());
1103+
log_info("Could not open directory '{}'", THERMAL_BASE_PATH);
1104+
log_debug("Got opendir error: ({}) {}", syserr.code().value(), syserr.what());
1105+
return;
1106+
}
1107+
1108+
struct dirent *entry = nullptr;
11011109

11021110
while ((entry = readdir(dir.get())) != nullptr)
11031111
{
1104-
if (strncmp(entry->d_name, THERMAL_DIR_NAME_START.c_str(), THERMAL_DIR_NAME_START.size()))
1112+
if (entry->d_type != DT_DIR || !std::string_view { entry->d_name }.starts_with(THERMAL_DIR_NAME_START))
11051113
{
11061114
// Not a socket temperature match candidate
11071115
log_verbose(
11081116
"Ignoring directory '{}' among thermal zone candidates in path {}", entry->d_type, THERMAL_BASE_PATH);
11091117
continue;
11101118
}
11111119

1112-
std::string path(fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_PATH_EXTENSION));
1120+
auto pathPrefix = fmt::format("{}/{}", THERMAL_BASE_PATH, entry->d_name);
1121+
1122+
auto path = fmt::format("{}/{}", pathPrefix, THERMAL_PATH_EXTENSION);
1123+
11131124
unsigned int socketId = GetSocketIdFromThermalFile(path);
11141125
if (socketId != SYSMON_INVALID_SOCKET_ID)
11151126
{
1116-
std::string tempFilePath(
1117-
fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_TEMPERATURE_FILENAME));
1127+
auto tempFilePath = fmt::format("{}/{}", pathPrefix, THERMAL_TEMPERATURE_FILENAME);
11181128
log_debug("Recording temperature path '{}' for Socket {}", tempFilePath, socketId);
11191129
m_socketTemperatureFileMap[socketId] = std::move(tempFilePath);
11201130

1121-
std::string trip0TypePath(
1122-
fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_TEMPERATURE_TRIPTYPE0_FILENAME));
1123-
std::string trip1TypePath(
1124-
fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_TEMPERATURE_TRIPTYPE1_FILENAME));
1125-
std::string trip0TempPath(
1126-
fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_TEMPERATURE_TRIPTEMP0_FILENAME));
1127-
std::string trip1TempPath(
1128-
fmt::format("{}/{}/{}", THERMAL_BASE_PATH, entry->d_name, THERMAL_TEMPERATURE_TRIPTEMP1_FILENAME));
1131+
auto trip0TypePath = fmt::format("{}/{}", pathPrefix, THERMAL_TEMPERATURE_TRIPTYPE0_FILENAME);
1132+
auto trip1TypePath = fmt::format("{}/{}", pathPrefix, THERMAL_TEMPERATURE_TRIPTYPE1_FILENAME);
1133+
auto trip0TempPath = fmt::format("{}/{}", pathPrefix, THERMAL_TEMPERATURE_TRIPTEMP0_FILENAME);
1134+
auto trip1TempPath = fmt::format("{}/{}", pathPrefix, THERMAL_TEMPERATURE_TRIPTEMP1_FILENAME);
11291135
std::ifstream type0File(trip0TypePath);
11301136
std::ifstream type1File(trip1TypePath);
11311137
auto type0Type = readEntireFile(type0File);

modules/sysmon/DcgmSystemMonitor.cpp

Lines changed: 67 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@
1414
* limitations under the License.
1515
*/
1616

17+
#include <cerrno>
1718
#include <cstring>
1819
#include <dirent.h>
1920
#include <fstream>
2021
#include <iostream>
21-
#include <stdio.h>
22+
#include <system_error>
2223

2324
#include <DcgmLogging.h>
2425
#include <DcgmStringHelpers.h>
@@ -105,7 +106,7 @@ dcgmPowerFileInfo_t DcgmSystemMonitor::GetCpuSocketFileIndex(const std::string &
105106

106107
void DcgmSystemMonitor::PopulateSocketPowerMap(const std::string &baseDir)
107108
{
108-
static const std::string POWER_PATH_EXTENSION("device/");
109+
static const std::string POWER_PATH_EXTENSION("device");
109110
static const std::string HWMON_DIR_NAME_START("hwmon");
110111
static const std::string POWER_USAGE_FILENAME("power1_average");
111112
static const std::string POWER_CAP_FILENAME("power1_cap");
@@ -117,59 +118,77 @@ void DcgmSystemMonitor::PopulateSocketPowerMap(const std::string &baseDir)
117118
return;
118119
}
119120

120-
DIR *dir = opendir(baseDir.c_str());
121-
struct dirent *entry;
121+
auto dirDeleter = [](DIR *dir) {
122+
if (dir != nullptr)
123+
{
124+
closedir(dir);
125+
dir = nullptr;
126+
}
127+
};
128+
129+
auto dir = std::unique_ptr<DIR, decltype(dirDeleter)>(opendir(baseDir.c_str()), dirDeleter);
130+
if (!dir)
131+
{
132+
auto syserr = std::system_error(errno, std::generic_category());
133+
log_info("Could not open directory '{}'", baseDir);
134+
log_debug("Got opendir error: ({}) {}", syserr.code().value(), syserr.what());
135+
return;
136+
}
137+
138+
struct dirent *entry = nullptr;
122139

123-
while ((entry = readdir(dir)) != nullptr)
140+
while ((entry = readdir(dir.get())) != nullptr)
124141
{
125-
if (entry->d_type == DT_DIR
126-
|| !strncmp(entry->d_name, HWMON_DIR_NAME_START.c_str(), HWMON_DIR_NAME_START.size()))
142+
if (entry->d_type != DT_DIR || !std::string_view { entry->d_name }.starts_with(HWMON_DIR_NAME_START))
143+
{
144+
continue;
145+
}
146+
147+
auto pathPrefix = fmt::format("{}/{}/{}", baseDir, entry->d_name, POWER_PATH_EXTENSION);
148+
149+
std::string path = fmt::format("{}/{}", pathPrefix, POWER_OEM_INFO);
150+
151+
dcgmPowerFileInfo_t info = GetCpuSocketFileIndex(path);
152+
if (info.socketId == SYSMON_INVALID_SOCKET_ID)
153+
{
154+
log_debug("Invalid socket ID. Skipping: '{}'", entry->d_name);
155+
continue;
156+
}
157+
158+
if (info.fileType == DCGM_POWER_USAGE_FILE)
127159
{
128-
std::string path = fmt::format("{}/{}/{}{}", baseDir, entry->d_name, POWER_PATH_EXTENSION, POWER_OEM_INFO);
129-
dcgmPowerFileInfo_t info = GetCpuSocketFileIndex(path);
130-
if (info.socketId != SYSMON_INVALID_SOCKET_ID)
160+
// Something like: /sys/class/hwmon/hwmon4/device/power1_average
161+
auto usagePath = fmt::format("{}/{}", pathPrefix, POWER_USAGE_FILENAME);
162+
163+
if (info.fileSrc == DCGM_CPU_POWER_SOCKET_FILE)
164+
{
165+
m_cpuSocketToPowerUsagePath[info.socketId] = std::move(usagePath);
166+
}
167+
else if (info.fileSrc == DCGM_SYSIO_POWER_SOCKET_FILE)
168+
{
169+
m_sysioSocketToPowerUsagePath[info.socketId] = std::move(usagePath);
170+
}
171+
else if (info.fileSrc == DCGM_MODULE_POWER_SOCKET_FILE)
131172
{
132-
if (info.fileType == DCGM_POWER_USAGE_FILE)
133-
{
134-
if (info.fileSrc == DCGM_CPU_POWER_SOCKET_FILE)
135-
{
136-
// Something like: /sys/class/hwmon/hwmon4/device/power1_average
137-
m_cpuSocketToPowerUsagePath[info.socketId] = fmt::format(
138-
"{}/{}/{}{}", baseDir, entry->d_name, POWER_PATH_EXTENSION, POWER_USAGE_FILENAME);
139-
}
140-
else if (info.fileSrc == DCGM_SYSIO_POWER_SOCKET_FILE)
141-
{
142-
// Something like: /sys/class/hwmon/hwmon4/device/power1_average
143-
m_sysioSocketToPowerUsagePath[info.socketId] = fmt::format(
144-
"{}/{}/{}{}", baseDir, entry->d_name, POWER_PATH_EXTENSION, POWER_USAGE_FILENAME);
145-
}
146-
else if (info.fileSrc == DCGM_MODULE_POWER_SOCKET_FILE)
147-
{
148-
// Something like: /sys/class/hwmon/hwmon4/device/power1_average
149-
m_moduleSocketToPowerUsagePath[info.socketId] = fmt::format(
150-
"{}/{}/{}{}", baseDir, entry->d_name, POWER_PATH_EXTENSION, POWER_USAGE_FILENAME);
151-
}
152-
else
153-
{
154-
log_debug("File source invalid: '{}'", info.fileSrc);
155-
return;
156-
}
157-
}
158-
else if (info.fileType == DCGM_POWER_CAP_FILE)
159-
{
160-
// Something like: /sys/class/hwmon/hwmon3/device/power1_cap
161-
m_socketToPowerCapPath[info.socketId]
162-
= fmt::format("{}/{}/{}{}", baseDir, entry->d_name, POWER_PATH_EXTENSION, POWER_CAP_FILENAME);
163-
}
164-
else
165-
{
166-
log_debug("File type invalid: '{}'", info.fileType);
167-
return;
168-
}
173+
m_moduleSocketToPowerUsagePath[info.socketId] = std::move(usagePath);
169174
}
175+
else
176+
{
177+
log_debug("File source invalid: '{}'", info.fileSrc);
178+
return;
179+
}
180+
}
181+
else if (info.fileType == DCGM_POWER_CAP_FILE)
182+
{
183+
// Something like: /sys/class/hwmon/hwmon3/device/power1_cap
184+
m_socketToPowerCapPath[info.socketId] = fmt::format("{}/{}", pathPrefix, POWER_CAP_FILENAME);
185+
}
186+
else
187+
{
188+
log_debug("File type invalid: '{}'", info.fileType);
189+
return;
170190
}
171191
}
172-
closedir(dir);
173192
}
174193

175194
double DcgmSystemMonitor::GetPowerValueFromFile(const std::string &path)

0 commit comments

Comments
 (0)