|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
namespace Microsoft.Extensions.AI.Evaluation.Quality;
/// <summary>
/// An <see langword="abstract"/> base class that can be used to implement an AI-based <see cref="IEvaluator"/>.
/// </summary>
public abstract class ChatConversationEvaluator : IEvaluator
{
/// <inheritdoc/>
/// <remarks>
/// Derived evaluators supply these names. The <see cref="EvaluationMetric.Name"/>s of the
/// <see cref="EvaluationMetric"/>s returned from <see cref="InitializeResult"/> are expected to match them.
/// </remarks>
public abstract IReadOnlyCollection<string> EvaluationMetricNames { get; }
/// <summary>
/// Gets a value indicating whether this <see cref="IEvaluator"/> ignores the conversation history and considers
/// only the request and response being evaluated as part of the evaluation it performs.
/// </summary>
/// <value>
/// <see langword="true"/> if this <see cref="IEvaluator"/> ignores the conversation history (in which case only
/// the last supplied message, when it is a user message, is treated as the request); <see langword="false"/> if
/// the conversation history is also considered as part of the evaluation it performs.
/// </value>
protected abstract bool IgnoresHistory { get; }
/// <summary>
/// Gets the system prompt that this <see cref="IEvaluator"/> sends ahead of the evaluation prompt.
/// </summary>
/// <value>
/// The system prompt text. The default implementation returns <see langword="null"/>; a
/// <see langword="null"/>, empty or whitespace-only value results in no system message being included.
/// </value>
protected virtual string? SystemPrompt
{
    get { return null; }
}
/// <inheritdoc/>
/// <remarks>
/// <para>
/// The evaluation proceeds as follows: the result is initialized via <see cref="InitializeResult"/>; a blank
/// model response short-circuits with an error diagnostic; the supplied messages are split into the user request
/// and the preceding history; when a token counter is configured, the system prompt and the history-free
/// evaluation prompt are charged against the input token limit (failing with an error diagnostic when the limit
/// is exceeded) and the oldest history messages that do not fit in the remaining budget are dropped; finally the
/// evaluation prompt is rendered with the retained history and handed to
/// <see cref="PerformEvaluationAsync"/>.
/// </para>
/// <para>
/// When history messages had to be dropped, a warning diagnostic is added to every metric in the result.
/// </para>
/// </remarks>
public virtual async ValueTask<EvaluationResult> EvaluateAsync(
    IEnumerable<ChatMessage> messages,
    ChatResponse modelResponse,
    ChatConfiguration? chatConfiguration = null,
    IEnumerable<EvaluationContext>? additionalContext = null,
    CancellationToken cancellationToken = default)
{
    // Validate all required arguments up front so that a null 'messages' surfaces as an
    // ArgumentNullException naming the right parameter rather than failing deep inside a helper.
    _ = Throw.IfNull(messages);
    _ = Throw.IfNull(modelResponse);
    _ = Throw.IfNull(chatConfiguration);

    EvaluationResult result = InitializeResult();

    if (string.IsNullOrWhiteSpace(modelResponse.Text))
    {
        result.AddDiagnosticsToAllMetrics(
            EvaluationDiagnostic.Error(
                "Evaluation failed because the model response supplied for evaluation was null or empty."));

        return result;
    }

    (ChatMessage? userRequest, List<ChatMessage> history) = GetUserRequestAndHistory(messages);

    // SystemPrompt is virtual; read it once so that token accounting and the message that is eventually sent
    // are guaranteed to be based on the same value.
    string? systemPrompt = SystemPrompt;
    bool hasSystemPrompt = !string.IsNullOrWhiteSpace(systemPrompt);

    int inputTokenLimit = 0;
    int ignoredMessagesCount = 0;

    if (chatConfiguration.TokenCounter is { } tokenCounter)
    {
        inputTokenLimit = tokenCounter.InputTokenLimit;
        int tokenBudget = inputTokenLimit;

        void OnTokenBudgetExceeded()
        {
            EvaluationDiagnostic tokenBudgetExceeded =
                EvaluationDiagnostic.Error(
                    $"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");

            result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
        }

        if (hasSystemPrompt)
        {
            tokenBudget -= tokenCounter.CountTokens(systemPrompt!);
            if (tokenBudget < 0)
            {
                OnTokenBudgetExceeded();
                return result;
            }
        }

        // Render the prompt without any history first: this is the minimum that must fit in the budget.
        string baseEvaluationPrompt =
            await RenderEvaluationPromptAsync(
                userRequest,
                modelResponse,
                includedHistory: [],
                additionalContext,
                cancellationToken).ConfigureAwait(false);

        tokenBudget -= tokenCounter.CountTokens(baseEvaluationPrompt);
        if (tokenBudget < 0)
        {
            OnTokenBudgetExceeded();
            return result;
        }

        if (history.Count > 0 && !IgnoresHistory)
        {
            ignoredMessagesCount =
                await TrimHistoryToTokenBudgetAsync(
                    history,
                    tokenBudget,
                    chatConfiguration,
                    cancellationToken).ConfigureAwait(false);
        }
    }

    var evaluationMessages = new List<ChatMessage>();
    if (hasSystemPrompt)
    {
        evaluationMessages.Add(new ChatMessage(ChatRole.System, systemPrompt!));
    }

    string evaluationPrompt =
        await RenderEvaluationPromptAsync(
            userRequest,
            modelResponse,
            includedHistory: history,
            additionalContext,
            cancellationToken).ConfigureAwait(false);

    evaluationMessages.Add(new ChatMessage(ChatRole.User, evaluationPrompt));

    await PerformEvaluationAsync(
        chatConfiguration,
        evaluationMessages,
        result,
        cancellationToken).ConfigureAwait(false);

    if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
    {
#pragma warning disable S103 // Lines should not be too long
        result.AddDiagnosticsToAllMetrics(
            EvaluationDiagnostic.Warning(
                $"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
#pragma warning restore S103
    }

    return result;
}

/// <summary>
/// Removes, in place, the oldest messages from <paramref name="history"/> that cannot be rendered within the
/// supplied <paramref name="tokenBudget"/>, keeping the most recent messages in their original order.
/// </summary>
/// <param name="history">The conversation history (oldest first) that is to be trimmed.</param>
/// <param name="tokenBudget">The number of tokens available for rendering the history.</param>
/// <param name="chatConfiguration">The configuration that is forwarded to <see cref="CanRenderAsync"/>.</param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>The number of messages that were removed from <paramref name="history"/>.</returns>
private async ValueTask<int> TrimHistoryToTokenBudgetAsync(
    List<ChatMessage> history,
    int tokenBudget,
    ChatConfiguration chatConfiguration,
    CancellationToken cancellationToken)
{
    int includedMessagesCount = 0;

    // Walk from the newest message towards the oldest so that the most recent context is retained. The walk
    // stops at the first message that does not fit; that message and everything older than it is dropped.
    for (int index = history.Count - 1; index >= 0; index--)
    {
        cancellationToken.ThrowIfCancellationRequested();

        (bool canRender, int remainingTokenBudget) =
            await CanRenderAsync(
                history[index],
                tokenBudget,
                chatConfiguration,
                cancellationToken).ConfigureAwait(false);

        if (!canRender)
        {
            break;
        }

        tokenBudget = remainingTokenBudget;
        includedMessagesCount++;
    }

    int ignoredMessagesCount = history.Count - includedMessagesCount;
    if (ignoredMessagesCount > 0)
    {
        // The ignored messages are the oldest ones, i.e. the ones at the front of the list.
        history.RemoveRange(index: 0, count: ignoredMessagesCount);
    }

    return ignoredMessagesCount;
}
/// <summary>
/// Checks whether the supplied <paramref name="message"/> can be rendered into the evaluation prompt without
/// exceeding the remaining <paramref name="tokenBudget"/>.
/// </summary>
/// <param name="message">
/// A message from the conversation history of the response being evaluated that is a candidate for inclusion in
/// the evaluation prompt.
/// </param>
/// <param name="tokenBudget">
/// The number of tokens still available for rendering additional content as part of the evaluation prompt.
/// </param>
/// <param name="chatConfiguration">
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
/// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
/// </param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>
/// A tuple whose first element is <see langword="true"/> when the <paramref name="message"/> fits within the
/// <paramref name="tokenBudget"/>, and whose second element is the budget that remains after rendering it (or
/// the unchanged <paramref name="tokenBudget"/> when the message does not fit). When no token counter is
/// configured, the message always fits and the budget is returned unchanged.
/// </returns>
protected virtual ValueTask<(bool canRender, int remainingTokenBudget)> CanRenderAsync(
    ChatMessage message,
    int tokenBudget,
    ChatConfiguration chatConfiguration,
    CancellationToken cancellationToken)
{
    _ = Throw.IfNull(message);
    _ = Throw.IfNull(chatConfiguration);

    IEvaluationTokenCounter? counter = chatConfiguration.TokenCounter;
    if (counter is null)
    {
        // Without a counter there is nothing to measure against, so every message is accepted.
        return new ValueTask<(bool, int)>((true, tokenBudget));
    }

    string? authorName = message.AuthorName;
    string roleName = message.Role.Value;
    string text = message.Text ?? string.Empty;

    // Count the same fragments, in the same order, as the "[role] content\n" and
    // "[author (role)] content\n" shapes; each fragment is counted separately and the counts are summed.
    int requiredTokens = counter.CountTokens("[");
    if (string.IsNullOrWhiteSpace(authorName))
    {
        requiredTokens += counter.CountTokens(roleName);
        requiredTokens += counter.CountTokens("] ");
    }
    else
    {
        requiredTokens += counter.CountTokens(authorName!);
        requiredTokens += counter.CountTokens(" (");
        requiredTokens += counter.CountTokens(roleName);
        requiredTokens += counter.CountTokens(")] ");
    }

    requiredTokens += counter.CountTokens(text);
    requiredTokens += counter.CountTokens("\n");

    bool fits = requiredTokens <= tokenBudget;
    int remaining = fits ? tokenBudget - requiredTokens : tokenBudget;
    return new ValueTask<(bool, int)>((fits, remaining));
}
/// <summary>
/// Produces the textual form of the supplied <paramref name="response"/> for inclusion in the evaluation prompt
/// that this <see cref="IEvaluator"/> uses.
/// </summary>
/// <param name="response">
/// The chat response under evaluation that is to be rendered as part of the evaluation prompt.
/// </param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>
/// The concatenation, in order, of the rendered form of every message in the <paramref name="response"/>.
/// </returns>
/// <remarks>
/// The default implementation renders each message via <see cref="RenderAsync(ChatMessage, CancellationToken)"/>,
/// one after the other.
/// </remarks>
protected virtual async ValueTask<string> RenderAsync(ChatResponse response, CancellationToken cancellationToken)
{
    _ = Throw.IfNull(response);

    var renderedMessages = new List<string>();
    foreach (ChatMessage responseMessage in response.Messages)
    {
        string renderedMessage = await RenderAsync(responseMessage, cancellationToken).ConfigureAwait(false);
        renderedMessages.Add(renderedMessage);
    }

    return string.Concat(renderedMessages);
}
/// <summary>
/// Produces the textual form of the supplied <paramref name="message"/> for inclusion in the evaluation prompt
/// that this <see cref="IEvaluator"/> uses.
/// </summary>
/// <param name="message">
/// A message from the conversation history of the response being evaluated that is to be rendered as part of
/// the evaluation prompt.
/// </param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>
/// A single line of the form <c>[role] content</c>, or <c>[author (role)] content</c> when the message has a
/// non-blank author name, terminated by a newline.
/// </returns>
protected virtual ValueTask<string> RenderAsync(ChatMessage message, CancellationToken cancellationToken)
{
    _ = Throw.IfNull(message);

    string? author = message.AuthorName;
    string role = message.Role.Value;
    string? content = message.Text;

    string rendered;
    if (string.IsNullOrWhiteSpace(author))
    {
        rendered = $"[{role}] {content}\n";
    }
    else
    {
        rendered = $"[{author} ({role})] {content}\n";
    }

    return new ValueTask<string>(rendered);
}
/// <summary>
/// Renders the information present in the supplied parameters into a prompt that this <see cref="IEvaluator"/>
/// uses to perform the evaluation.
/// </summary>
/// <param name="userRequest">
/// The request that produced the <paramref name="modelResponse"/> that is to be evaluated. This is
/// <see langword="null"/> when the last message supplied for evaluation is not a user message.
/// </param>
/// <param name="modelResponse">The response that is to be evaluated.</param>
/// <param name="includedHistory">
/// The conversation history (excluding the <paramref name="userRequest"/> and <paramref name="modelResponse"/>)
/// that is to be included as part of the evaluation prompt.
/// </param>
/// <param name="additionalContext">
/// Additional contextual information (beyond that which is available in the <paramref name="userRequest"/> and
/// <paramref name="includedHistory"/>) that this <see cref="IEvaluator"/> may need to accurately evaluate the
/// supplied <paramref name="modelResponse"/>.
/// </param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>The evaluation prompt.</returns>
/// <remarks>
/// When a token counter is configured, <see cref="EvaluateAsync"/> invokes this method twice per evaluation:
/// first with an empty <paramref name="includedHistory"/> to measure the token cost of the prompt without any
/// history, and then again with the history that is to be included in the prompt that is actually sent.
/// </remarks>
protected abstract ValueTask<string> RenderEvaluationPromptAsync(
ChatMessage? userRequest,
ChatResponse modelResponse,
IEnumerable<ChatMessage>? includedHistory,
IEnumerable<EvaluationContext>? additionalContext,
CancellationToken cancellationToken);
/// <summary>
/// Returns an <see cref="EvaluationResult"/> that includes default values for all the
/// <see cref="EvaluationMetric"/>s supported by this <see cref="IEvaluator"/>.
/// </summary>
/// <remarks>
/// <para>
/// The <see cref="EvaluationMetric.Name"/>s of the <see cref="EvaluationMetric"/>s contained in the returned
/// <see cref="EvaluationResult"/> should match <see cref="EvaluationMetricNames"/>.
/// </para>
/// <para>
/// <see cref="EvaluateAsync"/> calls this method once at the start of each evaluation. The returned result is
/// the one that receives diagnostics, is passed to <see cref="PerformEvaluationAsync"/>, and is ultimately
/// returned to the caller.
/// </para>
/// </remarks>
/// <returns>
/// An <see cref="EvaluationResult"/> that includes default values for all the
/// <see cref="EvaluationMetric"/>s supported by this <see cref="IEvaluator"/>.
/// </returns>
protected abstract EvaluationResult InitializeResult();
/// <summary>
/// Invokes the supplied <see cref="ChatConfiguration.ChatClient"/> with the supplied
/// <paramref name="evaluationMessages"/> to perform the evaluation, and includes the results as one or more
/// <see cref="EvaluationMetric"/>s in the supplied <paramref name="result"/>.
/// </summary>
/// <param name="chatConfiguration">
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
/// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
/// </param>
/// <param name="evaluationMessages">
/// The set of messages that are to be sent to the supplied <see cref="ChatConfiguration.ChatClient"/> to perform
/// the evaluation. As built by <see cref="EvaluateAsync"/>, this is an optional system message (present only
/// when <see cref="SystemPrompt"/> is not blank) followed by a single user message that contains the rendered
/// evaluation prompt.
/// </param>
/// <param name="result">
/// An <see cref="EvaluationResult"/> that includes a collection of <see cref="EvaluationMetric"/>s that are
/// supported by this <see cref="IEvaluator"/>.
/// </param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
/// <returns>A <see cref="ValueTask"/> that represents the asynchronous operation.</returns>
protected abstract ValueTask PerformEvaluationAsync(
ChatConfiguration chatConfiguration,
IList<ChatMessage> evaluationMessages,
EvaluationResult result,
CancellationToken cancellationToken);
/// <summary>
/// Splits the supplied <paramref name="messages"/> into the user request being evaluated and the conversation
/// history that precedes it.
/// </summary>
/// <param name="messages">The messages that were supplied for evaluation.</param>
/// <returns>
/// A tuple containing the last message when it is a user message (otherwise <see langword="null"/>), and the
/// history. The history is empty when <see cref="IgnoresHistory"/> is <see langword="true"/>; otherwise it is a
/// copy of <paramref name="messages"/> with the user request (if any) removed from the end.
/// </returns>
private (ChatMessage? userRequest, List<ChatMessage> history) GetUserRequestAndHistory(
    IEnumerable<ChatMessage> messages)
{
    if (IgnoresHistory)
    {
        // Only the trailing message matters here; nothing is retained as history.
        ChatMessage? trailingMessage = messages.LastOrDefault();
        List<ChatMessage> noHistory = [];

        if (trailingMessage is not null && trailingMessage.Role == ChatRole.User)
        {
            return (trailingMessage, noHistory);
        }

        return (null, noHistory);
    }

    // Work on a copy so that the caller's collection is never mutated.
    List<ChatMessage> precedingMessages = [.. messages];
    if (precedingMessages.Count == 0)
    {
        return (null, precedingMessages);
    }

    int trailingIndex = precedingMessages.Count - 1;
    ChatMessage? candidate = precedingMessages[trailingIndex];
    if (candidate is null || !(candidate.Role == ChatRole.User))
    {
        // The conversation does not end with a user message, so every message is treated as history.
        return (null, precedingMessages);
    }

    precedingMessages.RemoveAt(trailingIndex);
    return (candidate, precedingMessages);
}
}
|