// File: Contents\DataUriParser.cs
// Web Access
// Project: src\src\Libraries\Microsoft.Extensions.AI.Abstractions\Microsoft.Extensions.AI.Abstractions.csproj (Microsoft.Extensions.AI.Abstractions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
#if NET8_0_OR_GREATER
using System.Buffers.Text;
#endif
using System.Diagnostics;
using System.Net;
using System.Net.Http.Headers;
using System.Text;
 
namespace Microsoft.Extensions.AI;
 
/// <summary>
/// Minimal data URI parser based on RFC 2397: https://datatracker.ietf.org/doc/html/rfc2397.
/// </summary>
/// <remarks>
/// The parser accepts URIs of the form <c>data:[mediatype][;base64],data</c>. It validates the scheme,
/// the optional media type and, for base64 payloads, the payload itself. The payload is not decoded
/// during parsing; decoding happens on demand in <see cref="DataUri.ToByteArray"/>.
/// </remarks>
internal static class DataUriParser
{
    /// <summary>The metadata suffix that marks the payload as base64-encoded.</summary>
    private const string Base64Marker = ";base64";

    /// <summary>Gets the scheme prefix, including the trailing colon, that a data URI must start with.</summary>
    public static string Scheme => "data:";

    /// <summary>Parses a data URI into its media type, encoding flag and (still encoded) payload.</summary>
    /// <param name="dataUri">The full data URI, including the leading <c>data:</c> scheme.</param>
    /// <returns>
    /// A <see cref="DataUri"/> whose <see cref="DataUri.Data"/> is a slice of <paramref name="dataUri"/>
    /// (everything after the first comma) and whose <see cref="DataUri.MediaType"/> is
    /// <see langword="null"/> when the URI carries no media type.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// The input does not start with <c>data:</c>, contains no comma, declares base64 but carries an
    /// invalid base64 payload, or has a media type that fails validation.
    /// </exception>
    public static DataUri Parse(ReadOnlyMemory<char> dataUri)
    {
        // Validate, then trim off the "data:" scheme. The scheme comparison is case-insensitive.
        if (!dataUri.Span.StartsWith(Scheme.AsSpan(), StringComparison.OrdinalIgnoreCase))
        {
            throw new UriFormatException("Invalid data URI format: the data URI must start with 'data:'.");
        }

        dataUri = dataUri.Slice(Scheme.Length);

        // Find the first comma: everything before it is metadata, everything after it is data.
        int commaPos = dataUri.Span.IndexOf(',');
        if (commaPos < 0)
        {
            throw new UriFormatException("Invalid data URI format: the data URI must contain a comma separating the metadata and the data.");
        }

        ReadOnlyMemory<char> metadata = dataUri.Slice(0, commaPos);
        ReadOnlyMemory<char> data = dataUri.Slice(commaPos + 1);
        bool isBase64 = false;

        // Determine whether the data is Base64-encoded or percent-encoded (Uri-encoded).
        // If it's base64-encoded, validate it. If it's Uri-encoded, there's nothing to validate,
        // as WebUtility.UrlDecode will successfully decode any input with no sequence considered invalid.
        if (metadata.Span.EndsWith(Base64Marker.AsSpan(), StringComparison.OrdinalIgnoreCase))
        {
            // Strip the marker so that only the (optional) media type remains in the metadata.
            metadata = metadata.Slice(0, metadata.Length - Base64Marker.Length);
            isBase64 = true;
            if (!IsValidBase64Data(data.Span))
            {
                throw new UriFormatException("Invalid data URI format: the data URI is base64-encoded, but the data is not a valid base64 string.");
            }
        }

        // Validate the media type, if present. Surrounding whitespace is ignored.
        string? mediaType = null;
        if (!IsValidMediaType(metadata.Span.Trim(), ref mediaType))
        {
            throw new UriFormatException("Invalid data URI format: the media type is not valid.");
        }

        return new DataUri(data, isBase64, mediaType);
    }

    /// <summary>Validates that a media type is valid, and if successful, ensures we have it as a string.</summary>
    /// <param name="mediaTypeSpan">The media type text to validate.</param>
    /// <param name="mediaType">
    /// On entry, either <see langword="null"/> or a string with the same contents as
    /// <paramref name="mediaTypeSpan"/>. On return, <see langword="null"/> if the span is empty or
    /// whitespace; otherwise a string form of the media type. When the full validation path is taken,
    /// the string is assigned before validation, so it may be set even if this method returns
    /// <see langword="false"/>.
    /// </param>
    /// <returns>
    /// <see langword="true"/> if the media type is empty/whitespace, one of the well-known media types,
    /// or accepted by <see cref="MediaTypeHeaderValue.TryParse(string, out MediaTypeHeaderValue)"/>;
    /// otherwise <see langword="false"/>.
    /// </returns>
    public static bool IsValidMediaType(ReadOnlySpan<char> mediaTypeSpan, ref string? mediaType)
    {
        Debug.Assert(
            mediaType is null || mediaTypeSpan.Equals(mediaType.AsSpan(), StringComparison.Ordinal),
            "mediaType string should either be null or the same as the span");

        // If the media type is empty or all whitespace, normalize it to null.
        if (mediaTypeSpan.IsWhiteSpace())
        {
            mediaType = null;
            return true;
        }

        // For common media types, we can avoid both allocating a string for the span and avoid parsing overheads.
        // The match is exact (ordinal, case-sensitive); anything else falls through to the full validation below.
        string? knownType = mediaTypeSpan switch
        {
            "application/json" => "application/json",
            "application/octet-stream" => "application/octet-stream",
            "application/pdf" => "application/pdf",
            "application/xml" => "application/xml",
            "audio/mpeg" => "audio/mpeg",
            "audio/ogg" => "audio/ogg",
            "audio/wav" => "audio/wav",
            "image/apng" => "image/apng",
            "image/avif" => "image/avif",
            "image/bmp" => "image/bmp",
            "image/gif" => "image/gif",
            "image/jpeg" => "image/jpeg",
            "image/png" => "image/png",
            "image/svg+xml" => "image/svg+xml",
            "image/tiff" => "image/tiff",
            "image/webp" => "image/webp",
            "text/css" => "text/css",
            "text/csv" => "text/csv",
            "text/html" => "text/html",
            "text/javascript" => "text/javascript",
            "text/plain" => "text/plain",
            "text/plain;charset=UTF-8" => "text/plain;charset=UTF-8",
            "text/xml" => "text/xml",
            _ => null,
        };
        if (knownType is not null)
        {
            // Keep the caller's string if one was supplied; otherwise reuse the literal.
            mediaType ??= knownType;
            return true;
        }

        // Otherwise, do the full validation using the same logic as HttpClient.
        mediaType ??= mediaTypeSpan.ToString();
        return MediaTypeHeaderValue.TryParse(mediaType, out _);
    }

    /// <summary>Test whether the value is a base64 string without whitespace.</summary>
    /// <param name="value">The candidate base64 text. An empty span is considered valid.</param>
    /// <returns><see langword="true"/> if <paramref name="value"/> passes the check; otherwise <see langword="false"/>.</returns>
    private static bool IsValidBase64Data(ReadOnlySpan<char> value)
    {
        if (value.IsEmpty)
        {
            return true;
        }

#if NET8_0_OR_GREATER
        // Base64.IsValid accepts embedded whitespace, so whitespace is rejected explicitly.
        return Base64.IsValid(value) && !value.ContainsAny(" \t\r\n");
#else
        // Base64 text always comes in groups of four characters.
#pragma warning disable S109 // Magic numbers should not be used
        if (value.Length % 4 != 0)
#pragma warning restore S109
        {
            return false;
        }

        // The length is a non-zero multiple of four here, so stepping back twice stays in range.
        var index = value.Length - 1;

        // Step back over one or two padding chars
        if (value[index] == '=')
        {
            index--;
        }

        if (value[index] == '=')
        {
            index--;
        }

        // Now traverse over characters; everything before the padding must be in the base64 alphabet.
        for (var i = 0; i <= index; i++)
        {
#pragma warning disable S1067 // Expressions should not be too complex
            bool validChar = value[i] is (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') or (>= '0' and <= '9') or '+' or '/';
#pragma warning restore S1067
            if (!validChar)
            {
                return false;
            }
        }

        return true;
#endif
    }

    /// <summary>Provides the parts of a parsed data URI.</summary>
    /// <param name="data">The still-encoded payload (the text after the comma).</param>
    /// <param name="isBase64"><see langword="true"/> if <paramref name="data"/> is base64-encoded; <see langword="false"/> if it is percent-encoded.</param>
    /// <param name="mediaType">The media type, or <see langword="null"/> if none was specified.</param>
    public sealed class DataUri(ReadOnlyMemory<char> data, bool isBase64, string? mediaType)
    {
#pragma warning disable S3604 // False positive: Member initializer values should not be redundant
        /// <summary>Gets the media type, or <see langword="null"/> if none was specified.</summary>
        public string? MediaType { get; } = mediaType;

        /// <summary>Gets the still-encoded payload.</summary>
        public ReadOnlyMemory<char> Data { get; } = data;

        /// <summary>Gets a value indicating whether <see cref="Data"/> is base64-encoded rather than percent-encoded.</summary>
        public bool IsBase64 { get; } = isBase64;
#pragma warning restore S3604

        /// <summary>Decodes <see cref="Data"/> into bytes.</summary>
        /// <returns>
        /// The base64-decoded bytes when <see cref="IsBase64"/> is <see langword="true"/>; otherwise the
        /// UTF-8 bytes of the URL-decoded text.
        /// </returns>
        /// <remarks>
        /// NOTE(review): <see cref="WebUtility.UrlDecode(string)"/> also converts '+' to a space and
        /// round-trips the decoded bytes through UTF-8 text, so percent-encoded payloads that are not
        /// valid UTF-8 may not survive unchanged. Confirm this is the intended behavior for data URIs.
        /// </remarks>
        public byte[] ToByteArray() => IsBase64 ?
            Convert.FromBase64String(Data.ToString()) :
            Encoding.UTF8.GetBytes(WebUtility.UrlDecode(Data.ToString()));
    }
}