File: ResultsTests.cs
Project: src\test\Libraries\Microsoft.Extensions.AI.Evaluation.Integration.Tests\Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj (Microsoft.Extensions.AI.Evaluation.Integration.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable.
 
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.Reporting;
using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
using Xunit;
 
namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests;
 
public class ResultsTests
{
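    // A canned assistant message passed to EvaluateAsync in each test; the TestEvaluator
    // is assumed to ignore the response content and simply return its preconfigured TestMetrics.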
    private static readonly ChatMessage _testResponse = "Test response".ToAssistantMessage();
 
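    // Creates a ReportingConfiguration that stores results on disk, either under the
    // configured storage root or under a random temp directory when no settings are configured.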
    public static ReportingConfiguration CreateReportingConfiguration(IEvaluator evaluator) =>
        DiskBasedReportingConfiguration.Create(
            storageRootPath: Settings.Current.Configured ?
                Settings.Current.StorageRootPath :
                Path.Combine(Path.GetTempPath(), Path.GetRandomFileName()),
            evaluators: [evaluator],
            chatConfiguration: null, // Not needed for this test
            executionName: Constants.Version);
 
    #region Interpretations
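    // Interprets a BooleanMetric: a value of false passes (Exceptional) and true fails
    // (Unacceptable). Metrics without a value are left uninterpreted (null).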
    private static EvaluationMetricInterpretation? FailIfValueIsTrue(EvaluationMetric m)
    {
        const bool PassingValue = false;
 
        switch (m)
        {
            case BooleanMetric booleanMetric:
                if (booleanMetric.Value is bool value)
                {
                    return value is PassingValue
                        ? new EvaluationMetricInterpretation(rating: EvaluationRating.Exceptional)
                        : new EvaluationMetricInterpretation(
                            rating: EvaluationRating.Unacceptable,
                            failed: true,
                            reason: $"Value is {value}.");
                }
 
                break;
 
            default:
                throw new InvalidOperationException();
        }
 
        return null;
    }
 
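    // Interprets a NumericMetric on a 0-5 scale: values in (0, 5] map to ratings from
    // Unacceptable up to Exceptional, and anything else rates Inconclusive. A metric only
    // fails when its value is below 4 and its rating is conclusive (so a value of 0 rates
    // Inconclusive and does not fail). Metrics without a value are left uninterpreted (null).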
    private static EvaluationMetricInterpretation? FailIfValueIsLessThan4(EvaluationMetric m)
    {
        const double MinimumPassingScore = 4.0;
 
        switch (m)
        {
            case NumericMetric numericMetric:
                if (numericMetric.Value is double value)
                {
                    EvaluationRating rating = value switch
                    {
                        > 5.0 => EvaluationRating.Inconclusive,
                        > 4.0 and <= 5.0 => EvaluationRating.Exceptional,
                        > 3.0 and <= 4.0 => EvaluationRating.Good,
                        > 2.0 and <= 3.0 => EvaluationRating.Average,
                        > 1.0 and <= 2.0 => EvaluationRating.Poor,
                        > 0.0 and <= 1.0 => EvaluationRating.Unacceptable,
                        <= 0.0 => EvaluationRating.Inconclusive,
                        _ => EvaluationRating.Inconclusive,
                    };
 
                    return
                        value < MinimumPassingScore &&
                        rating is not (EvaluationRating.Inconclusive or EvaluationRating.Unknown)
                            ? new EvaluationMetricInterpretation(
                                rating,
                                failed: true,
                                reason: $"Value is less than {MinimumPassingScore}.")
                            : new EvaluationMetricInterpretation(rating);
                }
 
                break;
 
            default:
                throw new InvalidOperationException();
        }
 
        return null;
    }
 
#pragma warning disable S1067 // Expressions should not be too complex.
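    // Passes any metric that carries a value and fails any metric that does not, including
    // plain EvaluationMetric instances, which carry no value at all.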
    private static EvaluationMetricInterpretation? FailIfValueIsMissing(EvaluationMetric m) =>
        (m is NumericMetric n && n.Value is not null) ||
        (m is BooleanMetric b && b.Value is not null) ||
        (m is StringMetric s && s.Value is not null)
            ? new EvaluationMetricInterpretation(EvaluationRating.Good)
            : new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason: "Value is missing");
#pragma warning restore S1067
 
    private enum MeasurementSystem
    {
        None,
        Unknown,
        Metric,
        Imperial,
        USCustomary,
        Nautical,
        Astronomical,
        Multiple
    }
 
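    // Interprets a StringMetric by parsing its value as a MeasurementSystem: Imperial and
    // USCustomary pass (Exceptional); other recognized systems fail (Unacceptable);
    // unrecognized values fail as Inconclusive; missing values fail as Unknown.
    // Non-string metrics are left uninterpreted (null).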
    private static EvaluationMetricInterpretation? FailIfNotImperialOrUSCustomary(EvaluationMetric m)
    {
        if (m is not StringMetric e)
        {
            return null;
        }
 
        if (e.Value is null)
        {
            return new EvaluationMetricInterpretation(EvaluationRating.Unknown, failed: true, "Value is missing");
        }
 
        if (!Enum.TryParse(e.Value, out MeasurementSystem measurementSystem))
        {
            return new EvaluationMetricInterpretation(EvaluationRating.Inconclusive, failed: true, $"Value {e.Value} is not an allowed value");
        }
 
        if (measurementSystem is MeasurementSystem.USCustomary or MeasurementSystem.Imperial)
        {
            return new EvaluationMetricInterpretation(EvaluationRating.Exceptional, reason: $"Value is {e.Value}");
        }
 
        return new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason: $"Value is {e.Value}");
    }
    #endregion
 
    [Fact]
    public async Task ResultWithBooleanMetric()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new BooleanMetric("Metric with value false", false);
        var metricB = new BooleanMetric("Metric with value true", true);
        var metricC = new BooleanMetric("Metric without value");
        evaluator.TestMetrics = [metricA, metricB, metricC];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithBooleanMetric)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
 
        Assert.True(result.Metrics.Values.All(m => m.Interpretation is null));
        Assert.Null(metricA.Interpretation);
        Assert.Null(metricB.Interpretation);
        Assert.Null(metricC.Interpretation);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithBooleanMetricAndInterpretation()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new BooleanMetric("Metric with value false", false);
        var metricB = new BooleanMetric("Metric with value true", true);
        var metricC = new BooleanMetric("Metric without value");
        evaluator.TestMetrics = [metricA, metricB, metricC];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithBooleanMetricAndInterpretation)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
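        // metricA (false) should pass, metricB (true) should fail, and metricC (no value)
        // should remain uninterpreted.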
        result.Interpret(FailIfValueIsTrue);
 
        Assert.NotNull(metricA.Interpretation);
        Assert.False(metricA.Interpretation!.Failed);
        Assert.NotNull(metricB.Interpretation);
        Assert.True(metricB.Interpretation!.Failed);
        Assert.Null(metricC.Interpretation);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithStringMetric()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new StringMetric("Measurement System: None", "None");
        var metricB = new StringMetric("Measurement System: Unknown", "Unknown");
        var metricC = new StringMetric("Measurement System: Metric", "Metric");
        var metricD = new StringMetric("Measurement System: Imperial", "Imperial");
        var metricE = new StringMetric("Measurement System: USCustomary", "USCustomary");
        var metricF = new StringMetric("Measurement System: Nautical", "Nautical");
        var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical");
        var metricH = new StringMetric("Measurement System: Multiple", "Multiple");
        var metricI = new StringMetric("Measurement System: Blah", "Blah");
        var metricJ = new StringMetric("Measurement System: Empty", "");
        var metricK = new StringMetric("Measurement System: Null");
 
        evaluator.TestMetrics =
            [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithStringMetric)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
 
        Assert.Null(metricA.Interpretation);
        Assert.Null(metricB.Interpretation);
        Assert.Null(metricC.Interpretation);
        Assert.Null(metricD.Interpretation);
        Assert.Null(metricE.Interpretation);
        Assert.Null(metricF.Interpretation);
        Assert.Null(metricG.Interpretation);
        Assert.Null(metricH.Interpretation);
        Assert.Null(metricI.Interpretation);
        Assert.Null(metricJ.Interpretation);
        Assert.Null(metricK.Interpretation);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithStringMetricAndInterpretation()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new StringMetric("Measurement System: None", "None");
        var metricB = new StringMetric("Measurement System: Unknown", "Unknown");
        var metricC = new StringMetric("Measurement System: Metric", "Metric");
        var metricD = new StringMetric("Measurement System: Imperial", "Imperial");
        var metricE = new StringMetric("Measurement System: USCustomary", "USCustomary");
        var metricF = new StringMetric("Measurement System: Nautical", "Nautical");
        var metricG = new StringMetric("Measurement System: Astronomical", "Astronomical");
        var metricH = new StringMetric("Measurement System: Multiple", "Multiple");
        var metricI = new StringMetric("Measurement System: Blah", "Blah");
        var metricJ = new StringMetric("Measurement System: Empty", "");
        var metricK = new StringMetric("Measurement System: Null");
 
        evaluator.TestMetrics =
            [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH, metricI, metricJ, metricK];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithStringMetricAndInterpretation)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
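        // Only the Imperial and USCustomary values should pass; every other value
        // (including the unrecognized, empty, and missing ones) should fail.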
        result.Interpret(FailIfNotImperialOrUSCustomary);
 
        Assert.NotNull(metricA.Interpretation);
        Assert.True(metricA.Interpretation!.Failed);
        Assert.NotNull(metricB.Interpretation);
        Assert.True(metricB.Interpretation!.Failed);
        Assert.NotNull(metricC.Interpretation);
        Assert.True(metricC.Interpretation!.Failed);
        Assert.NotNull(metricD.Interpretation);
        Assert.False(metricD.Interpretation!.Failed);
        Assert.NotNull(metricE.Interpretation);
        Assert.False(metricE.Interpretation!.Failed);
        Assert.NotNull(metricF.Interpretation);
        Assert.True(metricF.Interpretation!.Failed);
        Assert.NotNull(metricG.Interpretation);
        Assert.True(metricG.Interpretation!.Failed);
        Assert.NotNull(metricH.Interpretation);
        Assert.True(metricH.Interpretation!.Failed);
        Assert.NotNull(metricI.Interpretation);
        Assert.True(metricI.Interpretation!.Failed);
        Assert.NotNull(metricJ.Interpretation);
        Assert.True(metricJ.Interpretation!.Failed);
        Assert.NotNull(metricK.Interpretation);
        Assert.True(metricK.Interpretation!.Failed);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithNumericMetrics()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new NumericMetric("Metric with value 0", 0);
        var metricB = new NumericMetric("Metric with value 1", 1);
        var metricC = new NumericMetric("Metric with value 2", 2);
        var metricD = new NumericMetric("Metric with value 3", 3);
        var metricE = new NumericMetric("Metric with value 4", 4);
        var metricF = new NumericMetric("Metric with value 5", 5);
        var metricG = new NumericMetric("Metric with value 6", 6);
        var metricH = new NumericMetric("Metric with no value");
        evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithNumericMetrics)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
 
        Assert.True(result.Metrics.Values.All(m => m.Interpretation is null));
        Assert.Null(metricA.Interpretation);
        Assert.Null(metricB.Interpretation);
        Assert.Null(metricC.Interpretation);
        Assert.Null(metricD.Interpretation);
        Assert.Null(metricE.Interpretation);
        Assert.Null(metricF.Interpretation);
        Assert.Null(metricG.Interpretation);
        Assert.Null(metricH.Interpretation);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithNumericMetricsAndInterpretation()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metricA = new NumericMetric("Metric with value 0", 0);
        var metricB = new NumericMetric("Metric with value 1", 1);
        var metricC = new NumericMetric("Metric with value 2", 2);
        var metricD = new NumericMetric("Metric with value 3", 3);
        var metricE = new NumericMetric("Metric with value 4", 4);
        var metricF = new NumericMetric("Metric with value 5", 5);
        var metricG = new NumericMetric("Metric with value 6", 6);
        var metricH = new NumericMetric("Metric with no value");
        evaluator.TestMetrics = [metricA, metricB, metricC, metricD, metricE, metricF, metricG, metricH];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithNumericMetricsAndInterpretation)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
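        // Values 1, 2 and 3 should fail; 4 and 5 should pass; 0 and 6 rate Inconclusive
        // and therefore do not fail; the metric with no value stays uninterpreted.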
        result.Interpret(FailIfValueIsLessThan4);
 
        Assert.NotNull(metricA.Interpretation);
        Assert.False(metricA.Interpretation!.Failed);
        Assert.NotNull(metricB.Interpretation);
        Assert.True(metricB.Interpretation!.Failed);
        Assert.NotNull(metricC.Interpretation);
        Assert.True(metricC.Interpretation!.Failed);
        Assert.NotNull(metricD.Interpretation);
        Assert.True(metricD.Interpretation!.Failed);
        Assert.NotNull(metricE.Interpretation);
        Assert.False(metricE.Interpretation!.Failed);
        Assert.NotNull(metricF.Interpretation);
        Assert.False(metricF.Interpretation!.Failed);
        Assert.NotNull(metricG.Interpretation);
        Assert.False(metricG.Interpretation!.Failed);
        Assert.Null(metricH.Interpretation);
 
        Assert.False(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithDiagnosticsOnUninterpretedMetrics()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metric1 = new BooleanMetric("Metric with all diagnostic severities");
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric2 = new BooleanMetric("Metric with warning and informational diagnostics");
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric3 = new EvaluationMetric("Metric with error diagnostics only");
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
 
        var metric4 = new StringMetric("Metric with warning diagnostics only");
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
 
        var metric5 = new NumericMetric("Metric with informational diagnostics only");
        metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
 
        evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithDiagnosticsOnUninterpretedMetrics)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
 
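        // No interpreter is applied in this test, so every Interpretation stays null,
        // but the diagnostics attached above should still surface on the result.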
        Assert.Null(metric1.Interpretation);
        Assert.Null(metric2.Interpretation);
        Assert.Null(metric3.Interpretation);
        Assert.Null(metric4.Interpretation);
        Assert.Null(metric5.Interpretation);
 
        Assert.True(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithDiagnosticsOnFailingMetrics()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metric1 = new BooleanMetric("Metric with all diagnostic severities");
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric2 = new BooleanMetric("Metric with warning and informational diagnostics");
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric3 = new EvaluationMetric("Metric with error diagnostics only");
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
 
        var metric4 = new StringMetric("Metric with warning diagnostics only");
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
 
        var metric5 = new NumericMetric("Metric with informational diagnostics only");
        metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
 
        evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithDiagnosticsOnFailingMetrics)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
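        // None of the five metrics carries a value, so FailIfValueIsMissing should fail
        // each of them while their diagnostics still surface on the result.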
        result.Interpret(FailIfValueIsMissing);
 
        Assert.NotNull(metric1.Interpretation);
        Assert.True(metric1.Interpretation!.Failed);
        Assert.NotNull(metric2.Interpretation);
        Assert.True(metric2.Interpretation!.Failed);
        Assert.NotNull(metric3.Interpretation);
        Assert.True(metric3.Interpretation!.Failed);
        Assert.NotNull(metric4.Interpretation);
        Assert.True(metric4.Interpretation!.Failed);
        Assert.NotNull(metric5.Interpretation);
        Assert.True(metric5.Interpretation!.Failed);
 
        Assert.True(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithDiagnosticsOnPassingMetrics()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metric1 = new BooleanMetric("Metric with all diagnostic severities", value: true);
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
        metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric2 = new BooleanMetric("Metric with warning and informational diagnostics", value: true);
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
        metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2"));
 
        var metric3 = new NumericMetric("Metric with error diagnostics only", value: 5);
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1"));
        metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2"));
 
        var metric4 = new StringMetric("Metric with warning diagnostics only", value: "A");
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1"));
        metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2"));
 
        var metric5 = new NumericMetric("Metric with informational diagnostics only", value: 4);
        metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1"));
 
        evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5];
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithDiagnosticsOnPassingMetrics)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
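        // All five metrics carry values, so FailIfValueIsMissing should pass each of them;
        // the attached diagnostics should still surface on the result.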
        result.Interpret(FailIfValueIsMissing);
 
        Assert.NotNull(metric1.Interpretation);
        Assert.False(metric1.Interpretation!.Failed);
        Assert.NotNull(metric2.Interpretation);
        Assert.False(metric2.Interpretation!.Failed);
        Assert.NotNull(metric3.Interpretation);
        Assert.False(metric3.Interpretation!.Failed);
        Assert.NotNull(metric4.Interpretation);
        Assert.False(metric4.Interpretation!.Failed);
        Assert.NotNull(metric5.Interpretation);
        Assert.False(metric5.Interpretation!.Failed);
 
        Assert.True(result.ContainsDiagnostics());
    }
 
    [Fact]
    public async Task ResultWithException()
    {
        var evaluator = new TestEvaluator();
        ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator);
 
        var metric = new BooleanMetric("Condition", true);
 
        evaluator.TestMetrics = [metric];
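        // ThrowOnEvaluate is assumed to make the TestEvaluator throw from its EvaluateAsync;
        // the evaluation pipeline should surface that exception as an error diagnostic on the
        // result rather than let it propagate.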
        evaluator.ThrowOnEvaluate = true;
 
        await using ScenarioRun scenarioRun =
            await reportingConfiguration.CreateScenarioRunAsync(
                $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(ResultsTests)}.{nameof(ResultWithException)}");
 
        EvaluationResult result = await scenarioRun.EvaluateAsync(_testResponse);
 
        Assert.True(result.ContainsDiagnostics(d => d.Severity == EvaluationDiagnosticSeverity.Error));
    }
}