// File: Linux\LinuxUtilizationProvider.cs
// Web Access
// Project: src\src\Libraries\Microsoft.Extensions.Diagnostics.ResourceMonitoring\Microsoft.Extensions.Diagnostics.ResourceMonitoring.csproj (Microsoft.Extensions.Diagnostics.ResourceMonitoring)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using System.Threading;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using Microsoft.Shared.Instruments;
 
namespace Microsoft.Extensions.Diagnostics.ResourceMonitoring.Linux;
 
/// <summary>
/// Reads CPU and memory usage through an <see cref="ILinuxUtilizationParser"/>, publishes the results
/// as observable gauges and serves <see cref="Snapshot"/> instances as an <see cref="ISnapshotProvider"/>.
/// </summary>
/// <remarks>
/// Computed utilization values are cached and only recomputed once the configured refresh interval has elapsed.
/// Cached state is guarded by <see cref="_cpuLocker"/> and <see cref="_memoryLocker"/>; every value handed back
/// to a caller is captured while the corresponding lock is held.
/// </remarks>
internal sealed class LinuxUtilizationProvider : ISnapshotProvider
{
    private const double One = 1.0;

    // A TimeSpan tick is 100 nanoseconds, so nanoseconds / Hundred yields ticks.
    private const long Hundred = 100L;
    private const double NanosecondsInSecond = 1_000_000_000;

    private readonly object _cpuLocker = new();
    private readonly object _memoryLocker = new();
    private readonly ILogger<LinuxUtilizationProvider> _logger;
    private readonly ILinuxUtilizationParser _parser;
    private readonly ulong _memoryLimit;
    private readonly long _cpuPeriodsInterval;
    private readonly TimeSpan _cpuRefreshInterval;
    private readonly TimeSpan _memoryRefreshInterval;
    private readonly TimeProvider _timeProvider;
    private readonly double _scaleRelativeToCpuRequestForTrackerApi;

    private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);

    // UTC ticks of the most recent measurement failure. Stored as a long and accessed only through
    // Interlocked so that concurrent readers never observe a partially written timestamp.
    private long _lastFailureUtcTicks = DateTimeOffset.MinValue.UtcTicks;

    // 1 while the last measurement attempt failed with one of the handled exceptions, otherwise 0.
    private int _measurementsUnavailable;

    private DateTimeOffset _refreshAfterCpu;
    private DateTimeOffset _refreshAfterMemory;
    private double _cpuPercentage = double.NaN;
    private double _lastCpuCoresUsed = double.NaN;
    private double _memoryPercentage;
    private long _previousCgroupCpuTime;
    private long _previousHostCpuTime;
    private long _previousCgroupCpuPeriodCounter;

    /// <summary>
    /// Gets the CPU and memory resources determined when this provider was constructed.
    /// </summary>
    public SystemResources Resources { get; }

    /// <summary>
    /// Initializes a new instance of the <see cref="LinuxUtilizationProvider"/> class: reads the initial counters
    /// and limits from <paramref name="parser"/>, registers the observable gauges and populates <see cref="Resources"/>.
    /// </summary>
    /// <param name="options">Supplies the refresh intervals and the <c>UseLinuxCalculationV2</c> switch.</param>
    /// <param name="parser">Source of CPU and memory readings.</param>
    /// <param name="meterFactory">Factory used to create the meter that owns the gauges.</param>
    /// <param name="logger">Optional logger; a null logger is used when omitted.</param>
    /// <param name="timeProvider">Optional time source; <see cref="TimeProvider.System"/> is used when omitted.</param>
    public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILinuxUtilizationParser parser,
        IMeterFactory meterFactory, ILogger<LinuxUtilizationProvider>? logger = null, TimeProvider? timeProvider = null)
    {
        _parser = parser;
        _logger = logger ?? NullLogger<LinuxUtilizationProvider>.Instance;
        _timeProvider = timeProvider ?? TimeProvider.System;
        DateTimeOffset now = _timeProvider.GetUtcNow();
        _cpuRefreshInterval = options.Value.CpuConsumptionRefreshInterval;
        _memoryRefreshInterval = options.Value.MemoryConsumptionRefreshInterval;
        _refreshAfterCpu = now;
        _refreshAfterMemory = now;
        _memoryLimit = _parser.GetAvailableMemoryInBytes();
        _previousHostCpuTime = _parser.GetHostCpuUsageInNanoseconds();
        _previousCgroupCpuTime = _parser.GetCgroupCpuUsageInNanoseconds();

        float hostCpus = _parser.GetHostCpuCount();
        float cpuLimit = _parser.GetCgroupLimitedCpus();
        float cpuRequest = _parser.GetCgroupRequestCpu();
        float scaleRelativeToCpuLimit = hostCpus / cpuLimit;
        float scaleRelativeToCpuRequest = hostCpus / cpuRequest;
        _scaleRelativeToCpuRequestForTrackerApi = hostCpus; // the division by cpuRequest is performed later on in the ResourceUtilization class

#pragma warning disable CA2000 // Dispose objects before losing scope
        // We don't dispose the meter because IMeterFactory handles that
        // An issue on analyzer side: https://github.com/dotnet/roslyn-analyzers/issues/6912
        // Related documentation: https://github.com/dotnet/docs/pull/37170
        var meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
#pragma warning restore CA2000 // Dispose objects before losing scope

        if (options.Value.UseLinuxCalculationV2)
        {
            // The V2 calculation replaces the limit/request read above with the V2 parser values.
            cpuLimit = _parser.GetCgroupLimitV2();
            cpuRequest = _parser.GetCgroupRequestCpuV2();

            // Get Cpu periods interval from cgroup
            _cpuPeriodsInterval = _parser.GetCgroupPeriodsIntervalInMicroSecondsV2();
            (_previousCgroupCpuTime, _previousCgroupCpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();

            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
                observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
                unit: "1");

            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
                observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationRequest(cpuRequest)),
                unit: "1");

            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ContainerCpuTime,
                observeValues: GetCpuTime,
                unit: "1");
        }
        else
        {
            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
                observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuLimit),
                unit: "1");

            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
                observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuRequest),
                unit: "1");

            _ = meter.CreateObservableGauge(
                name: ResourceUtilizationInstruments.ProcessCpuUtilization,
                observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuRequest),
                unit: "1");
        }

        _ = meter.CreateObservableGauge(
            name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization,
            observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
            unit: "1");

        _ = meter.CreateObservableGauge(
            name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
            observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
            unit: "1");

        // cpuRequest is the CPU request (aka guaranteed number of CPU units) for a pod; for a host it's 1 core.
        // cpuLimit is the CPU limit (aka max CPU units available) for a pod or for a host.
        // _memoryLimit (third argument) - Resource Memory Limit (in k8s terms).
        // _memoryLimit (fourth argument) - to keep the contract, this parameter gets the host available memory.
        Resources = new SystemResources(cpuRequest, cpuLimit, _memoryLimit, _memoryLimit);
        _logger.SystemResourcesInfo(cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
    }

    /// <summary>
    /// Returns the number of CPU cores used, computed as the cgroup CPU time delta divided by the elapsed
    /// CPU-period time since the previous refresh.
    /// </summary>
    /// <returns>
    /// The cached value while the CPU refresh interval has not elapsed or when either delta is not positive;
    /// otherwise the newly computed value. <see cref="double.NaN"/> until the first successful computation.
    /// </returns>
    public double CpuUtilizationV2()
    {
        DateTimeOffset now = _timeProvider.GetUtcNow();
        lock (_cpuLocker)
        {
            if (now < _refreshAfterCpu)
            {
                return _lastCpuCoresUsed;
            }
        }

        // The parser is queried outside the lock; the refresh deadline is re-checked once the lock is re-acquired.
        (long cpuUsageTime, long cpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();
        lock (_cpuLocker)
        {
            if (now < _refreshAfterCpu)
            {
                return _lastCpuCoresUsed;
            }

            long deltaCgroup = cpuUsageTime - _previousCgroupCpuTime;
            long deltaPeriodCount = cpuPeriodCounter - _previousCgroupCpuPeriodCounter;

            if (deltaCgroup <= 0 || deltaPeriodCount <= 0)
            {
                return _lastCpuCoresUsed;
            }

            // _cpuPeriodsInterval is expressed in microseconds; multiply by 1000 to get nanoseconds.
            long deltaCpuPeriodInNanoseconds = deltaPeriodCount * _cpuPeriodsInterval * 1000;
            double coresUsed = deltaCgroup / (double)deltaCpuPeriodInNanoseconds;

            _logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);

            _lastCpuCoresUsed = coresUsed;
            _refreshAfterCpu = now.Add(_cpuRefreshInterval);
            _previousCgroupCpuTime = cpuUsageTime;
            _previousCgroupCpuPeriodCounter = cpuPeriodCounter;

            // Return the value computed under the lock instead of re-reading the shared field afterwards.
            return coresUsed;
        }
    }

    /// <summary>
    /// Returns the ratio of the cgroup CPU time delta to the host CPU time delta since the previous refresh,
    /// capped at 1.
    /// </summary>
    /// <returns>
    /// The cached value while the CPU refresh interval has not elapsed or when either delta is not positive;
    /// otherwise the newly computed value. <see cref="double.NaN"/> until the first successful computation.
    /// </returns>
    public double CpuUtilization()
    {
        DateTimeOffset now = _timeProvider.GetUtcNow();

        lock (_cpuLocker)
        {
            if (now < _refreshAfterCpu)
            {
                return _cpuPercentage;
            }
        }

        // The parser is queried outside the lock; the refresh deadline is re-checked once the lock is re-acquired.
        long hostCpuTime = _parser.GetHostCpuUsageInNanoseconds();
        long cgroupCpuTime = _parser.GetCgroupCpuUsageInNanoseconds();

        lock (_cpuLocker)
        {
            if (now < _refreshAfterCpu)
            {
                return _cpuPercentage;
            }

            long deltaHost = hostCpuTime - _previousHostCpuTime;
            long deltaCgroup = cgroupCpuTime - _previousCgroupCpuTime;

            if (deltaHost <= 0 || deltaCgroup <= 0)
            {
                return _cpuPercentage;
            }

            double percentage = Math.Min(One, (double)deltaCgroup / deltaHost);

            _logger.CpuUsageData(cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);

            _cpuPercentage = percentage;
            _refreshAfterCpu = now.Add(_cpuRefreshInterval);
            _previousCgroupCpuTime = cgroupCpuTime;
            _previousHostCpuTime = hostCpuTime;

            // Return the value computed under the lock instead of re-reading the shared field afterwards.
            return percentage;
        }
    }

    /// <summary>
    /// Returns memory usage as a fraction of the memory limit read at construction, capped at 1.
    /// </summary>
    /// <returns>
    /// The cached value while the memory refresh interval has not elapsed; otherwise the refreshed value.
    /// </returns>
    public double MemoryUtilization()
    {
        DateTimeOffset now = _timeProvider.GetUtcNow();

        lock (_memoryLocker)
        {
            if (now < _refreshAfterMemory)
            {
                return _memoryPercentage;
            }
        }

        ulong memoryUsed = _parser.GetMemoryUsageInBytes();
        double memoryPercentage;

        lock (_memoryLocker)
        {
            if (now >= _refreshAfterMemory)
            {
                _memoryPercentage = Math.Min(One, (double)memoryUsed / _memoryLimit);
                _refreshAfterMemory = now.Add(_memoryRefreshInterval);
            }

            // Capture the value while still holding the lock so the logged and returned numbers agree.
            memoryPercentage = _memoryPercentage;
        }

        _logger.MemoryUsageData(memoryUsed, _memoryLimit, memoryPercentage);

        return memoryPercentage;
    }

    /// <summary>
    /// Builds a <see cref="Snapshot"/> from the current host CPU time, cgroup CPU time and memory usage.
    /// </summary>
    /// <remarks>
    /// Not adding caching, to preserve original semantics of the code.
    /// The snapshot provider is called in intervals configured by the tracker.
    /// We multiply by scale to make hardcoded algorithm in tracker's calculator to produce right results.
    /// </remarks>
    public Snapshot GetSnapshot()
    {
        long hostTime = _parser.GetHostCpuUsageInNanoseconds();
        long cgroupTime = _parser.GetCgroupCpuUsageInNanoseconds();
        ulong memoryUsed = _parser.GetMemoryUsageInBytes();

        return new Snapshot(
            totalTimeSinceStart: TimeSpan.FromTicks(hostTime / Hundred),
            kernelTimeSinceStart: TimeSpan.Zero,
            userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleRelativeToCpuRequestForTrackerApi)),
            memoryUsageInBytes: memoryUsed);
    }

    /// <summary>
    /// Evaluates <paramref name="func"/> through <see cref="TryGetValueWithRetry{T}"/> and wraps the result
    /// in a single-element array, or returns an empty array when no value could be obtained.
    /// </summary>
    private Measurement<double>[] GetMeasurementWithRetry(Func<double> func)
    {
        if (!TryGetValueWithRetry(func, out double value))
        {
            return Array.Empty<Measurement<double>>();
        }

        return new[] { new Measurement<double>(value) };
    }

    /// <summary>
    /// Invokes <paramref name="func"/> unless a previous failure happened less than the retry interval ago.
    /// </summary>
    /// <param name="func">The measurement to take.</param>
    /// <param name="value">The measured value, or <see langword="default"/> when the method returns <see langword="false"/>.</param>
    /// <returns>
    /// <see langword="true"/> when <paramref name="func"/> ran to completion; <see langword="false"/> when the call was
    /// skipped or threw <see cref="System.IO.FileNotFoundException"/>, <see cref="System.IO.DirectoryNotFoundException"/>
    /// or <see cref="UnauthorizedAccessException"/>. Other exceptions propagate to the caller.
    /// </returns>
    private bool TryGetValueWithRetry<T>(Func<T> func, out T value)
        where T : struct
    {
        value = default;
        if (Volatile.Read(ref _measurementsUnavailable) == 1 &&
            _timeProvider.GetUtcNow().UtcTicks - Interlocked.Read(ref _lastFailureUtcTicks) < _retryInterval.Ticks)
        {
            return false;
        }

        try
        {
            value = func();
            _ = Interlocked.CompareExchange(ref _measurementsUnavailable, 0, 1);

            return true;
        }
        catch (Exception ex) when (
            ex is System.IO.FileNotFoundException ||
            ex is System.IO.DirectoryNotFoundException ||
            ex is System.UnauthorizedAccessException)
        {
            // Publish the failure time before raising the flag so a reader that sees the flag also sees the timestamp.
            _ = Interlocked.Exchange(ref _lastFailureUtcTicks, _timeProvider.GetUtcNow().UtcTicks);
            _ = Interlocked.Exchange(ref _measurementsUnavailable, 1);

            return false;
        }
    }

    // Math.Min() is used below to mitigate margin errors and various kinds of precisions losses
    // due to the fact that the calculation itself is not an atomic operation:
    private double CpuUtilizationRequest(double cpuRequest) => Math.Min(One, CpuUtilizationV2() / cpuRequest);
    private double CpuUtilizationLimit(double cpuLimit) => Math.Min(One, CpuUtilizationV2() / cpuLimit);

    /// <summary>
    /// Yields up to two measurements tagged with <c>cpu.mode</c>: "system" (host CPU usage converted from
    /// nanoseconds to seconds) and "user" (the value returned by <see cref="CpuUtilizationV2"/>).
    /// A measurement is omitted when its value could not be obtained.
    /// </summary>
    private IEnumerable<Measurement<double>> GetCpuTime()
    {
        if (TryGetValueWithRetry(_parser.GetHostCpuUsageInNanoseconds, out long systemCpuTime))
        {
            yield return new Measurement<double>(systemCpuTime / NanosecondsInSecond, [new KeyValuePair<string, object?>("cpu.mode", "system")]);
        }

        // NOTE(review): CpuUtilizationV2 returns cores used over the last refresh window, not an accumulated
        // CPU time - confirm this is the intended value for the "user" measurement.
        if (TryGetValueWithRetry(CpuUtilizationV2, out double userCpuTime))
        {
            yield return new Measurement<double>(userCpuTime, [new KeyValuePair<string, object?>("cpu.mode", "user")]);
        }
    }
}