Skip to content

Add resiliency to Resource Monitoring in Linux #6489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ internal interface ITcpStateInfoProvider
/// Gets the last known TCP/IP v4 state of the system.
/// </summary>
/// <returns>An instance of <see cref="TcpStateInfo"/>.</returns>
TcpStateInfo GetpIpV4TcpStateInfo();
TcpStateInfo GetIpV4TcpStateInfo();

/// <summary>
/// Gets the last known TCP/IP v6 state of the system.
/// </summary>
/// <returns>An instance of <see cref="TcpStateInfo"/>.</returns>
TcpStateInfo GetpIpV6TcpStateInfo();
TcpStateInfo GetIpV6TcpStateInfo();
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.Linq;
using Microsoft.Extensions.ObjectPool;
using Microsoft.Shared.Pools;

Expand All @@ -23,7 +24,7 @@ internal sealed class DiskStatsReader(IFileSystem fileSystem) : IDiskStatsReader
/// Reads and returns all disk statistics entries.
/// </summary>
/// <returns>An array of <see cref="DiskStats"/> entries, excluding devices whose names start with any of the given prefixes.</returns>
public List<DiskStats> ReadAll()
public DiskStats[] ReadAll(string[] skipDevicePrefixes)
{
var diskStatsList = new List<DiskStats>();

Expand All @@ -41,7 +42,11 @@ public List<DiskStats> ReadAll()
try
{
DiskStats stat = DiskStatsReader.ParseLine(line);
diskStatsList.Add(stat);
if (!skipDevicePrefixes.Any(prefix =>
stat.DeviceName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
{
diskStatsList.Add(stat);
}
}
#pragma warning disable CA1031
catch (Exception)
Expand All @@ -51,7 +56,7 @@ public List<DiskStats> ReadAll()
}
}

return diskStatsList;
return diskStatsList.ToArray();
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;

namespace Microsoft.Extensions.Diagnostics.ResourceMonitoring.Linux.Disk;

/// <summary>
Expand All @@ -14,5 +12,5 @@ internal interface IDiskStatsReader
/// Gets all the disk statistics from the system.
/// </summary>
/// <returns>An array of <see cref="DiskStats"/> instances.</returns>
List<DiskStats> ReadAll();
DiskStats[] ReadAll(string[] skipDevicePrefixes);
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Frozen;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.Metrics;
using System.Linq;
using System.IO;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
Expand All @@ -22,14 +23,22 @@ internal sealed class LinuxSystemDiskMetrics
private const string DeviceKey = "system.device";
private const string DirectionKey = "disk.io.direction";

// Exclude devices with these prefixes because they represent virtual, loopback, or device-mapper disks
// that do not correspond to real physical storage. Including them would distort system disk I/O metrics.
private static readonly string[] _skipDevicePrefixes = new[] { "ram", "loop", "dm-" };
private static readonly KeyValuePair<string, object?> _directionReadTag = new(DirectionKey, "read");
private static readonly KeyValuePair<string, object?> _directionWriteTag = new(DirectionKey, "write");
private readonly ILogger<LinuxSystemDiskMetrics> _logger;
private readonly TimeProvider _timeProvider;
private readonly IDiskStatsReader _diskStatsReader;
private readonly object _lock = new();
private readonly Dictionary<string, DiskStats> _baselineDiskStatsDict = [];
private List<DiskStats> _diskStatsSnapshot = [];
private readonly FrozenDictionary<string, DiskStats> _baselineDiskStatsDict = FrozenDictionary<string, DiskStats>.Empty;
private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);

private DateTimeOffset _lastDiskStatsFailure = DateTimeOffset.MinValue;
private bool _diskStatsUnavailable;

private DiskStats[] _diskStatsSnapshot = [];
private DateTimeOffset _lastRefreshTime = DateTimeOffset.MinValue;

public LinuxSystemDiskMetrics(
Expand All @@ -48,7 +57,7 @@ public LinuxSystemDiskMetrics(
}

// We need to read the disk stats once to get the baseline values
_baselineDiskStatsDict = GetAllDiskStats().ToDictionary(d => d.DeviceName);
_baselineDiskStatsDict = GetAllDiskStats().ToFrozenDictionary(d => d.DeviceName);

#pragma warning disable CA2000 // Dispose objects before losing scope
// We don't dispose the meter because IMeterFactory handles that
Expand Down Expand Up @@ -85,7 +94,7 @@ public LinuxSystemDiskMetrics(
private IEnumerable<Measurement<long>> GetDiskIoMeasurements()
{
List<Measurement<long>> measurements = [];
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();

foreach (DiskStats diskStats in diskStatsSnapshot)
{
Expand All @@ -102,7 +111,7 @@ private IEnumerable<Measurement<long>> GetDiskIoMeasurements()
private IEnumerable<Measurement<long>> GetDiskOperationMeasurements()
{
List<Measurement<long>> measurements = [];
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();

foreach (DiskStats diskStats in diskStatsSnapshot)
{
Expand All @@ -119,7 +128,7 @@ private IEnumerable<Measurement<long>> GetDiskOperationMeasurements()
private IEnumerable<Measurement<double>> GetDiskIoTimeMeasurements()
{
List<Measurement<double>> measurements = [];
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();

foreach (DiskStats diskStats in diskStatsSnapshot)
{
Expand All @@ -131,12 +140,12 @@ private IEnumerable<Measurement<double>> GetDiskIoTimeMeasurements()
return measurements;
}

private List<DiskStats> GetDiskStatsSnapshot()
private DiskStats[] GetDiskStatsSnapshot()
{
lock (_lock)
{
DateTimeOffset now = _timeProvider.GetUtcNow();
if (_diskStatsSnapshot.Count == 0 || (now - _lastRefreshTime).TotalSeconds > MinimumDiskStatsRefreshIntervalInSeconds)
if (_diskStatsSnapshot.Length == 0 || (now - _lastRefreshTime).TotalSeconds > MinimumDiskStatsRefreshIntervalInSeconds)
{
_diskStatsSnapshot = GetAllDiskStats();
_lastRefreshTime = now;
Expand All @@ -146,27 +155,37 @@ private List<DiskStats> GetDiskStatsSnapshot()
return _diskStatsSnapshot;
}

private List<DiskStats> GetAllDiskStats()
private DiskStats[] GetAllDiskStats()
{
if (_diskStatsUnavailable &&
_timeProvider.GetUtcNow() - _lastDiskStatsFailure < _retryInterval)
{
return Array.Empty<DiskStats>();
}

try
{
List<DiskStats> diskStatsList = _diskStatsReader.ReadAll();

// We should not include ram, loop, or dm(device-mapper) devices in the disk stats, should we?
diskStatsList = diskStatsList
.Where(d => !d.DeviceName.StartsWith("ram", StringComparison.OrdinalIgnoreCase)
&& !d.DeviceName.StartsWith("loop", StringComparison.OrdinalIgnoreCase)
&& !d.DeviceName.StartsWith("dm-", StringComparison.OrdinalIgnoreCase))
.ToList();
DiskStats[] diskStatsList = _diskStatsReader.ReadAll(_skipDevicePrefixes);
_diskStatsUnavailable = false;

return diskStatsList;
}
catch (Exception ex) when (
ex is FileNotFoundException ||
ex is DirectoryNotFoundException ||
ex is UnauthorizedAccessException)
{
_logger.HandleDiskStatsException(ex.Message);
_lastDiskStatsFailure = _timeProvider.GetUtcNow();
_diskStatsUnavailable = true;
}
#pragma warning disable CA1031
catch (Exception ex)
#pragma warning restore CA1031
{
Log.HandleDiskStatsException(_logger, ex.Message);
_logger.HandleDiskStatsException(ex.Message);
}

return [];
return Array.Empty<DiskStats>();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
// _memoryLimit - Resource Memory Limit (in k8s terms)
// _memoryLimit - To keep the contract, this parameter will get the Host available memory
Resources = new SystemResources(cpuRequest, cpuLimit, _memoryLimit, _memoryLimit);
Log.SystemResourcesInfo(_logger, cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
_logger.SystemResourcesInfo(cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
}

public double CpuUtilizationWithoutHostDelta()
Expand Down Expand Up @@ -144,7 +144,7 @@ public double CpuUtilizationWithoutHostDelta()
{
coresUsed = deltaCgroup / (double)deltaCpuPeriodInNanoseconds;

Log.CpuUsageDataV2(_logger, cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);

_lastCpuCoresUsed = coresUsed;
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
Expand All @@ -158,7 +158,7 @@ public double CpuUtilizationWithoutHostDelta()
{
coresUsed = deltaCgroup / actualElapsedNanoseconds;

Log.CpuUsageDataV2(_logger, cpuUsageTime, _previousCgroupCpuTime, actualElapsedNanoseconds, coresUsed);
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, actualElapsedNanoseconds, coresUsed);

_lastCpuCoresUsed = coresUsed;
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
Expand Down Expand Up @@ -188,15 +188,15 @@ public double CpuUtilizationLimit(float cpuLimit)
{
_cpuUtilizationLimit100PercentExceededCounter?.Add(1);
_cpuUtilizationLimit100PercentExceeded++;
Log.CounterMessage100(_logger, _cpuUtilizationLimit100PercentExceeded);
_logger.CounterMessage100(_cpuUtilizationLimit100PercentExceeded);
}

// Increment counter if utilization exceeds 110%
if (utilization > CpuLimitThreshold110Percent)
{
_cpuUtilizationLimit110PercentExceededCounter?.Add(1);
_cpuUtilizationLimit110PercentExceeded++;
Log.CounterMessage110(_logger, _cpuUtilizationLimit110PercentExceeded);
_logger.CounterMessage110(_cpuUtilizationLimit110PercentExceeded);
}

return utilization;
Expand Down Expand Up @@ -228,7 +228,7 @@ public double CpuUtilization()
{
double percentage = Math.Min(One, (double)deltaCgroup / deltaHost);

Log.CpuUsageData(_logger, cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);
_logger.CpuUsageData(cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);

_cpuPercentage = percentage;
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
Expand Down Expand Up @@ -266,7 +266,7 @@ public double MemoryUtilization()
}
}

Log.MemoryUsageData(_logger, memoryUsed, _memoryLimit, _memoryPercentage);
_logger.MemoryUsageData(memoryUsed, _memoryLimit, _memoryPercentage);

return _memoryPercentage;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ internal static partial class Log
"Computed CPU usage with CgroupCpuTime = {cgroupCpuTime}, HostCpuTime = {hostCpuTime}, PreviousCgroupCpuTime = {previousCgroupCpuTime}, PreviousHostCpuTime = {previousHostCpuTime}, CpuPercentage = {cpuPercentage}.")]
#pragma warning restore S103 // Lines should not be too long
public static partial void CpuUsageData(
ILogger logger,
this ILogger logger,
long cgroupCpuTime,
long hostCpuTime,
long previousCgroupCpuTime,
Expand All @@ -25,21 +25,26 @@ public static partial void CpuUsageData(
[LoggerMessage(2, LogLevel.Debug,
"Computed memory usage with MemoryUsedInBytes = {memoryUsed}, MemoryLimit = {memoryLimit}, MemoryPercentage = {memoryPercentage}.")]
public static partial void MemoryUsageData(
ILogger logger,
this ILogger logger,
ulong memoryUsed,
double memoryLimit,
double memoryPercentage);

[LoggerMessage(3, LogLevel.Debug,
"System resources information: CpuLimit = {cpuLimit}, CpuRequest = {cpuRequest}, MemoryLimit = {memoryLimit}, MemoryRequest = {memoryRequest}.")]
public static partial void SystemResourcesInfo(ILogger logger, double cpuLimit, double cpuRequest, ulong memoryLimit, ulong memoryRequest);
public static partial void SystemResourcesInfo(
this ILogger logger,
double cpuLimit,
double cpuRequest,
ulong memoryLimit,
ulong memoryRequest);

[LoggerMessage(4, LogLevel.Debug,
#pragma warning disable S103 // Lines should not be too long
"For CgroupV2, Computed CPU usage with CgroupCpuTime = {cgroupCpuTime}, PreviousCgroupCpuTime = {previousCgroupCpuTime}, ActualElapsedNanoseconds = {actualElapsedNanoseconds}, CpuCores = {cpuCores}.")]
#pragma warning restore S103 // Lines should not be too long
public static partial void CpuUsageDataV2(
ILogger logger,
this ILogger logger,
long cgroupCpuTime,
long previousCgroupCpuTime,
double actualElapsedNanoseconds,
Expand All @@ -48,16 +53,18 @@ public static partial void CpuUsageDataV2(
[LoggerMessage(5, LogLevel.Debug,
"CPU utilization exceeded 100%: Counter = {counterValue}")]
public static partial void CounterMessage100(
ILogger logger,
this ILogger logger,
long counterValue);

[LoggerMessage(6, LogLevel.Debug,
"CPU utilization exceeded 110%: Counter = {counterValue}")]
public static partial void CounterMessage110(
ILogger logger,
this ILogger logger,
long counterValue);

[LoggerMessage(7, LogLevel.Warning,
"Error while getting disk stats: Error={errorMessage}")]
public static partial void HandleDiskStatsException(ILogger logger, string errorMessage);
public static partial void HandleDiskStatsException(
this ILogger logger,
string errorMessage);
}
Loading
Loading