File: Telemetry\CrashTelemetryRecorder.cs
Web Access
Project: ..\..\..\src\Framework\Microsoft.Build.Framework.csproj (Microsoft.Build.Framework)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Runtime.CompilerServices;
#if NETFRAMEWORK
using Microsoft.VisualStudio.Telemetry;
#endif
 
namespace Microsoft.Build.Framework.Telemetry;
 
/// <summary>
/// Centralized helper for recording and flushing crash/failure telemetry.
/// All methods are best-effort and will never throw.
/// </summary>
internal static class CrashTelemetryRecorder
{
    /// <summary>
    /// Interval in milliseconds between EndBuild hang diagnostic emissions.
    /// When EndBuild is stuck waiting for submissions or nodes, diagnostics are emitted at this interval.
    /// </summary>
    public const int EndBuildHangDiagnosticsIntervalMs = 30_000;
 
    /// <summary>
    /// Records crash telemetry data for later emission via <see cref="FlushCrashTelemetry"/>.
    /// </summary>
    /// <param name="exception">The exception that caused the crash.</param>
    /// <param name="exitType">Exit type classification.</param>
    /// <param name="isUnhandled">True if the exception was not caught by any catch block.</param>
    /// <param name="isCritical">Whether the exception is classified as critical (OOM, StackOverflow, etc.).</param>
    /// <param name="buildEngineVersion">MSBuild version string, if available.</param>
    /// <param name="buildEngineFrameworkName">Framework name, if available.</param>
    /// <param name="buildEngineHost">Host name (VS, VSCode, CLI, etc.), if available.</param>
    /// <param name="isStandaloneExecution">True if MSBuild runs from command line, false if hosted.</param>
    /// <param name="maxNodeCount">Maximum number of build nodes configured.</param>
    /// <param name="activeNodeCount">Number of currently active build nodes at crash time.</param>
    /// <param name="submissionCount">Number of active build submissions at crash time.</param>
    public static void RecordCrashTelemetry(
        Exception exception,
        CrashExitType exitType,
        bool isUnhandled,
        bool isCritical,
        string? buildEngineVersion = null,
        string? buildEngineFrameworkName = null,
        string? buildEngineHost = null,
        bool? isStandaloneExecution = null,
        int? maxNodeCount = null,
        int? activeNodeCount = null,
        int? submissionCount = null)
    {
        try
        {
            CrashTelemetry crashTelemetry = CreateCrashTelemetry(exception, exitType, isUnhandled, isCritical);
            crashTelemetry.BuildEngineVersion = buildEngineVersion;
            crashTelemetry.BuildEngineFrameworkName = buildEngineFrameworkName;
            crashTelemetry.BuildEngineHost = buildEngineHost;
            crashTelemetry.IsStandaloneExecution = isStandaloneExecution;
            crashTelemetry.MaxNodeCount = maxNodeCount;
            crashTelemetry.ActiveNodeCount = activeNodeCount;
            crashTelemetry.SubmissionCount = submissionCount;
            KnownTelemetry.CrashTelemetry = crashTelemetry;
        }
        catch
        {
            // Best effort: telemetry must never cause a secondary failure.
        }
    }
 
    /// <summary>
    /// Records crash telemetry and immediately flushes it.
    /// Use when the process is about to terminate (e.g. unhandled exception handler).
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void RecordAndFlushCrashTelemetry(
        Exception exception,
        CrashExitType exitType,
        bool isUnhandled,
        bool isCritical)
    {
        try
        {
            CrashTelemetry crashTelemetry = CreateCrashTelemetry(exception, exitType, isUnhandled, isCritical);
 
            // Initialize here because the process is about to die — this may be
            // the only chance to set up telemetry (e.g., crash before Main() init,
            // or in a task AppDomain with separate static state).
            TelemetryManager.Instance?.Initialize(isStandalone: false);
 
            using IActivity? activity = TelemetryManager.Instance
                ?.DefaultActivitySource
                ?.StartActivity(TelemetryConstants.Crash);
            activity?.SetTags(crashTelemetry);
 
            PostFaultEvent(crashTelemetry);
        }
        catch
        {
            // Best effort: telemetry must never cause a secondary failure.
        }
    }
 
    /// <summary>
    /// Flushes any pending crash telemetry via the telemetry manager.
    /// Requires that TelemetryManager has already been initialized by the caller.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void FlushCrashTelemetry()
    {
        try
        {
            CrashTelemetry? crashTelemetry = KnownTelemetry.CrashTelemetry;
            if (crashTelemetry is null)
            {
                return;
            }
 
            KnownTelemetry.CrashTelemetry = null;
 
            // Do not call TelemetryManager.Initialize here — the caller (Main or BuildManager)
            // is responsible for initialization. Calling Initialize from here would create a
            // VS telemetry session when tests call MSBuildApp.Execute() in-process, causing
            // environment variable side effects.
            using IActivity? activity = TelemetryManager.Instance
                ?.DefaultActivitySource
                ?.StartActivity(TelemetryConstants.Crash);
            activity?.SetTags(crashTelemetry);
 
            PostFaultEvent(crashTelemetry);
        }
        catch
        {
            // Best effort: telemetry must never cause a secondary failure.
        }
    }
 
    /// <summary>
    /// Posts a <c>FaultEvent</c> to the VS telemetry session so that crashes
    /// appear in Prism fault dashboards alongside other VS component faults.
    /// Only available on .NET Framework where the VS Telemetry SDK is loaded.
    /// See https://dev.azure.com/devdiv/DevDiv/_wiki/wikis/DevDiv.wiki/1022/FaultEvent-instrumentation-guide
    /// </summary>
#if NETFRAMEWORK
    [MethodImpl(MethodImplOptions.NoInlining)]
#endif
    private static void PostFaultEvent(CrashTelemetry crashTelemetry)
    {
#if NETFRAMEWORK
        try
        {
            if (crashTelemetry.Exception is null || TelemetryManager.IsOptOut())
            {
                return;
            }
 
            string eventName = $"{TelemetryConstants.EventPrefix}{TelemetryConstants.Crash}";
            string description = $"{crashTelemetry.ExitType}: {crashTelemetry.ExceptionType}";
            var sanitizedException = new SanitizedException(crashTelemetry.Exception!);
            var faultEvent = new FaultEvent(eventName, description, sanitizedException);
 
            faultEvent.Properties[$"{TelemetryConstants.PropertyPrefix}ExitType"] = crashTelemetry.ExitType.ToString();
            faultEvent.Properties[$"{TelemetryConstants.PropertyPrefix}CrashOrigin"] = crashTelemetry.CrashOrigin.ToString();
 
            if (crashTelemetry.CrashOriginNamespace is not null)
            {
                faultEvent.Properties[$"{TelemetryConstants.PropertyPrefix}CrashOriginNamespace"] = crashTelemetry.CrashOriginNamespace;
            }
 
            if (crashTelemetry.StackHash is not null)
            {
                faultEvent.Properties[$"{TelemetryConstants.PropertyPrefix}StackHash"] = crashTelemetry.StackHash;
            }
 
            TelemetryService.DefaultSession?.PostEvent(faultEvent);
        }
        catch
        {
            // Best effort: fault telemetry must never cause a secondary failure.
        }
#endif
    }
 
    /// <summary>
    /// Exception wrapper that sanitizes message and stack trace to remove PII
    /// before being passed to VS Telemetry's <c>FaultEvent</c>.
    /// </summary>
    internal sealed class SanitizedException : Exception
    {
        private readonly string? _sanitizedStackTrace;
 
        public SanitizedException(Exception original)
            : base(CrashTelemetry.TruncateMessage(original.Message) ?? original.GetType().FullName,
                   original.InnerException is not null ? new SanitizedException(original.InnerException) : null)
        {
            _sanitizedStackTrace = original.StackTrace is not null
                ? CrashTelemetry.SanitizeFilePathsInText(original.StackTrace)
                : null;
        }
 
        public override string? StackTrace => _sanitizedStackTrace;
    }
 
    private static CrashTelemetry CreateCrashTelemetry(
        Exception exception,
        CrashExitType exitType,
        bool isUnhandled,
        bool isCritical)
    {
        CrashTelemetry crashTelemetry = new();
        crashTelemetry.PopulateFromException(exception);
        crashTelemetry.ExitType = exitType;
        crashTelemetry.IsCritical = isCritical;
        crashTelemetry.IsUnhandled = isUnhandled;
        return crashTelemetry;
    }
 
    /// <summary>
    /// Emits a pre-populated <see cref="CrashTelemetry"/> for an EndBuild hang.
    /// Called periodically from timed wait loops so that diagnostics are available
    /// even if the hang never resolves (crash telemetry in the finally block would be unreachable).
    /// The caller is responsible for populating the <see cref="CrashTelemetry"/> with
    /// all relevant hang diagnostic data.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void EmitEndBuildHangDiagnostics(CrashTelemetry crashTelemetry)
    {
        try
        {
            TelemetryManager.Instance?.Initialize(crashTelemetry.IsStandaloneExecution ?? false);
 
            using IActivity? activity = TelemetryManager.Instance
                ?.DefaultActivitySource
                ?.StartActivity(TelemetryConstants.Crash);
            activity?.SetTags(crashTelemetry);
        }
        catch
        {
            // Best effort: diagnostic telemetry must never cause a secondary failure.
        }
    }
}