File: ReferenceParser.cs
Web Access
Project: src\src\sdk\src\Containers\Microsoft.NET.Build.Containers\Microsoft.NET.Build.Containers.csproj (Microsoft.NET.Build.Containers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Text.RegularExpressions;

namespace Microsoft.NET.Build.Containers;

/// <summary>
/// This class is a port of the regex patterns described in the regexp.go file in the OCI Registry spec repo distribution/distribution.
/// It is current as of SHA 78b9c98c5c31c30d74f9acb7d96f98552f2cf78f.
/// <see href="https://github.com/distribution/distribution/blob/78b9c98c5c31c30d74f9acb7d96f98552f2cf78f/reference/regexp.go">regexp.go</see> is the direct file link.
/// Comments on each member are lifted directly from this source.
/// Names of each member are deliberately non-.NET-standard, as they were kept aligned with their golang versions for easier comparison.
/// Visibility of each member is determined by golang rules - lowercase is private, uppercase is public. The exception is when a private member is used inside the golang module.
/// </summary>
/// <remarks>
/// Go's regexp package and .NET's <see cref="Regex"/> differ in ways that matter for this port, so a few
/// patterns are intentionally not character-for-character copies of the Go text:
/// <list type="bullet">
/// <item>Go's <c>\w</c> is ASCII-only, while .NET's is Unicode-aware; ASCII classes are spelled out explicitly.</item>
/// <item>Go's <c>$</c> (without the multi-line flag) matches only at the very end of the text, while .NET's also
/// matches before a final newline; <c>\z</c> is used for end anchoring.</item>
/// <item>Go's engine runs in linear time, while .NET's default engine backtracks; patterns avoid ambiguous nested
/// quantifiers that could backtrack exponentially.</item>
/// </list>
/// </remarks>
internal static class ReferenceParser
{

    /// <summary>
    /// alphaNumeric defines the alpha numeric atom, typically a
    /// component of names. This only allows lower case characters and digits.
    /// </summary>
    private static readonly string alphaNumeric = @"[a-z0-9]+";

    /// <summary>
    /// separator defines the separators allowed to be embedded in name
    /// components. This allows one period, one or two underscores and multiple
    /// dashes. Repeated dashes and underscores are intentionally treated
    /// differently. In order to support valid hostnames as name components,
    /// supporting repeated dash was added. Additionally double underscore is
    /// now allowed as a separator to loosen the restriction for previously
    /// supported names.
    /// </summary>
    /// <remarks>
    /// The dash run must be non-empty (<c>[-]+</c>, not <c>[-]*</c>). An empty separator would not change which
    /// strings match (two adjacent alphanumeric runs are just one longer run), but it would let the same input be
    /// split in exponentially many ways inside <see cref="nameComponent"/>, which makes non-matching inputs
    /// extremely slow on a backtracking engine.
    /// </remarks>
    private static readonly string separator = @"(?:[._]|__|[-]+)";

    /// <summary>
    /// nameComponent restricts registry path component names to start
    /// with at least one letter or number, with following parts able to be
    /// separated by one period, one or two underscore and multiple dashes.
    /// </summary>
    private static readonly string nameComponent = expression(alphaNumeric, optional(repeated(separator, alphaNumeric)));

    /// <summary>
    /// domainNameComponent restricts the registry domain component of a
    /// repository name to start with a component as defined by DomainRegexp.
    /// </summary>
    private static readonly string domainNameComponent = @"(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])";

    /// <summary>
    /// ipv6address are enclosed between square brackets and may be represented
    /// in many ways, see rfc5952. Only IPv6 in compressed or uncompressed format
    /// are allowed, IPv6 zone identifiers (rfc6874) or Special addresses such as
    /// IPv4-Mapped are deliberately excluded.
    /// </summary>
    private static readonly string ipv6address = expression(
        literal("["),
        @"(?:[a-fA-F0-9:]+)",
        literal("]")
    );

    /// <summary>
    /// domainName defines the structure of potential domain components
    /// that may be part of image names. This is purposely a subset of what is
    /// allowed by DNS to ensure backwards compatibility with Docker image
    /// names. This includes IPv4 addresses on decimal format.
    /// </summary>
    private static readonly string domainName = expression(
        domainNameComponent,
        // Require two name domainNameComponents to prevent matching a domain when parsing the short containername: 'ubuntu/runtime'.
        literal("."), domainNameComponent,
        optional(repeated(literal("."), domainNameComponent))
    );

    /// <summary>
    /// host defines the structure of potential domains based on the URI
    /// Host subcomponent on rfc3986. It may be a subset of DNS domain name,
    /// or an IPv4 address in decimal format, or an IPv6 address between square
    /// brackets (excluding zone identifiers as defined by rfc6874 or special
    /// addresses such as IPv4-Mapped).
    /// </summary>
    /// <remarks>The bare word <c>localhost</c> is accepted as a third alternative.</remarks>
    private static readonly string host = $"(?:{domainName}|{ipv6address}|localhost)";

    /// <summary>
    /// domain is a <see cref="host"/> optionally followed by a colon and a
    /// decimal port number. The host part is limited to what is
    /// allowed by the URI Host subcomponent on rfc3986 to ensure backwards
    /// compatibility with Docker image names.
    /// </summary>
    private static readonly string domain = expression(
        host,
        optional(literal(":"), "[0-9]+")
    );

    /// <summary>
    /// DomainRegexp defines the structure of potential domain components
    /// that may be part of image names. This is purposely a subset of what is
    /// allowed by DNS to ensure backwards compatibility with Docker image
    /// names.
    /// </summary>
    public static readonly Regex DomainRegexp = new(domain);

    /// <summary>
    /// This is a custom addition - we needed domain-part validation for a string we _knew_ was anchored,
    /// so this was included as a slight addition to the source material.
    /// </summary>
    public static readonly Regex AnchoredDomainRegexp = new(anchored(domain));

    /// <summary>
    /// valid tags are a word character followed by 0-127 subsequent word, dot, or dash characters.
    /// </summary>
    /// <remarks>
    /// "Word character" here means the ASCII set <c>[A-Za-z0-9_]</c>, which is what <c>\w</c> means in the Go
    /// source. The class is written out because .NET's <c>\w</c> would also accept non-ASCII letters and digits.
    /// </remarks>
    private static readonly string tag = @"[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}";

    /// <summary>
    /// TagRegexp matches valid tag names. From docker/docker:graph/tags.go.
    /// </summary>
    public static readonly Regex TagRegexp = new(tag);

    /// <summary>
    /// anchoredTag matches valid tag names, anchored at the start and
    /// end of the matched string.
    /// </summary>
    private static readonly string anchoredTag = anchored(tag);

    /// <summary>
    /// anchoredTagRegexp matches valid tag names, anchored at the start and
    /// end of the matched string.
    /// </summary>
    public static readonly Regex anchoredTagRegexp = new(anchoredTag);

    /// <summary>
    /// This is a custom addition - <see cref="digestPat"/> was decomposed into
    /// its algorithm and encoded components to allow for capturing group
    /// variants. These components follow the OCI image-spec digest grammar
    /// rather than the more restrictive upstream implementation
    /// (see <see href="https://github.com/distribution/reference/blob/main/regexp.go">distribution/reference</see>),
    /// which only allows hex-encoded digests with a minimum of 32 characters
    /// and requires algorithm components to start with a letter.
    ///
    /// See also:
    /// <see href="https://github.com/distribution/reference/blob/c577ba6addb2513df8704d4663b26c60eb0ac3f6/regexp.go#L70-L78">
    /// upstream TODO acknowledging this divergence
    /// </see>.
    ///
    /// <see cref="digestAlgorithmComponent"/> matches the algorithm portion of
    /// a digest (e.g. "sha256", "sha512", "multihash+base58"). Per OCI spec,
    /// algorithm components are lowercase alphanumeric.
    ///
    /// <see href="https://github.com/opencontainers/image-spec/blob/a4c6ade7bb82b316d45391f572727a63e268b252/descriptor.md#digests">
    /// OCI descriptor digest grammar
    /// </see>
    /// </summary>
    private static readonly string digestAlgorithmComponent = @"[a-z0-9]+(?:[+._-][a-z0-9]+)*";

    /// <summary>
    /// This is a custom addition - see <see cref="digestAlgorithmComponent"/>.
    /// <see cref="digestEncodedComponent"/> matches the encoded portion of a
    /// digest. Per OCI spec, this allows alphanumeric characters plus "=",
    /// "_", and "-" to support encodings beyond hex (e.g. base64url, base58).
    ///
    /// <see href="https://github.com/opencontainers/image-spec/blob/a4c6ade7bb82b316d45391f572727a63e268b252/descriptor.md#digests">
    /// OCI descriptor digest grammar
    /// </see>
    /// </summary>
    private static readonly string digestEncodedComponent = @"[a-zA-Z0-9=_-]+";

    /// <summary>
    /// digestPat matches valid digests: an algorithm component, a colon, and an encoded component.
    /// </summary>
    private static readonly string digestPat = $"{digestAlgorithmComponent}[:]{digestEncodedComponent}";

    /// <summary>
    /// DigestRegexp matches valid digests.
    /// </summary>
    public static readonly Regex DigestRegexp = new(digestPat);

    /// <summary>
    /// <see cref="AnchoredDigestRegexp"/> matches valid digests, anchored at
    /// the start and end of the matched string. This is based on the upstream
    /// anchoredDigestRegexp, with capturing groups added for the algorithm
    /// (group 1) and the encoded value (group 2).
    ///
    /// <see href="https://github.com/opencontainers/image-spec/blob/a4c6ade7bb82b316d45391f572727a63e268b252/descriptor.md#digests">OCI descriptor digest grammar</see>
    /// </summary>
    public static readonly Regex AnchoredDigestRegexp = new(anchored(
        capture(digestAlgorithmComponent),
        literal(":"),
        capture(digestEncodedComponent)
    ));

    /// <summary>
    /// namePat is the format for the name component of references: an
    /// optional domain followed by a forward slash, then one or more
    /// slash-separated name components. This pattern itself contains no
    /// capturing groups; see <see cref="anchoredName"/> for the capturing variant.
    /// </summary>
    private static readonly string namePat = expression(
        optional(domain, literal("/")),
        nameComponent,
        optional(repeated(literal("/"), nameComponent))
    );

    /// <summary>
    /// NameRegexp is the format for the name component of references. Logically this consists of
    /// a domain portion followed by one or more /-delimited name components.
    /// </summary>
    public static readonly Regex NameRegexp = new(namePat);

    /// <summary>
    /// anchoredName is used to parse a name value, capturing the
    /// domain (group 1) and trailing components (group 2), omitting
    /// the separating forward slash from either.
    /// </summary>
    private static readonly string anchoredName = anchored(
        optional(capture(domain), literal("/")),
        capture(nameComponent, optional(repeated(literal("/"), nameComponent)))
    );

    /// <summary>
    /// anchoredNameRegexp is used to parse a name value, capturing the
    /// domain and trailing components.
    /// </summary>
    public static readonly Regex anchoredNameRegexp = new(anchoredName);

    /// <summary>
    /// referencePat is the full supported format of a reference. The regexp
    /// is anchored and has capturing groups for name (group 1), tag (group 2),
    /// and digest (group 3) components.
    /// </summary>
    private static readonly string referencePat = anchored(
        capture(namePat),
        optional(literal(":"), capture(tag)),
        optional(literal("@"), capture(digestPat))
    );

    /// <summary>
    /// ReferenceRegexp is the full supported format of a reference. The regexp
    /// is anchored and has capturing groups for name, tag, and digest
    /// components.
    /// </summary>
    public static readonly Regex ReferenceRegexp = new(referencePat);

    /// <summary>
    /// identifier is the format for string identifier used as a
    /// content addressable identifier using sha256. These identifiers
    /// are like digests without the algorithm, since sha256 is used.
    /// </summary>
    private static readonly string identifier = @"([a-f0-9]{64})";

    /// <summary>
    /// IdentifierRegexp is the format for string identifier used as a
    /// content addressable identifier using sha256. These identifiers
    /// are like digests without the algorithm, since sha256 is used.
    /// </summary>
    public static readonly Regex IdentifierRegexp = new(identifier);

    /// <summary>
    /// shortIdentifier is the format used to represent a prefix
    /// of an identifier. A prefix may be used to match a sha256 identifier
    /// within a list of trusted identifiers.
    /// </summary>
    private static readonly string shortIdentifier = @"([a-f0-9]{6,64})";

    /// <summary>
    /// ShortIdentifierRegexp is the format used to represent a prefix
    /// of an identifier. A prefix may be used to match a sha256 identifier
    /// within a list of trusted identifiers.
    /// </summary>
    public static readonly Regex ShortIdentifierRegexp = new(shortIdentifier);

    /// <summary>
    /// anchoredIdentifier is used to check or match an
    /// identifier value, anchored at start and end of string.
    /// </summary>
    private static readonly string anchoredIdentifier = anchored(identifier);

    /// <summary>
    /// anchoredIdentifierRegexp is used to check or match an
    /// identifier value, anchored at start and end of string.
    /// </summary>
    private static readonly Regex anchoredIdentifierRegexp = new(anchoredIdentifier);

    /// <summary>
    /// anchoredShortIdentifier is used to check if a value
    /// is a possible identifier prefix, anchored at start and end
    /// of string.
    /// </summary>
    private static readonly string anchoredShortIdentifier = anchored(shortIdentifier);

    /// <summary>
    /// anchoredShortIdentifierRegexp is used to check if a value
    /// is a possible identifier prefix, anchored at start and end
    /// of string.
    /// </summary>
    private static readonly Regex anchoredShortIdentifierRegexp = new(anchoredShortIdentifier);

    /// <summary>
    /// literal compiles s into a literal regular expression, escaping any regexp
    /// reserved characters.
    /// </summary>
    /// <remarks>we use a simpler implementation than the golang source since Regex.Escape seems to do the job</remarks>
    private static string literal(string s) => Regex.Escape(s);

    /// <summary>
    /// expression defines a full expression, where each regular expression must
    /// follow the previous.
    /// </summary>
    /// <param name="segments">Pattern fragments to join, in order, with nothing inserted between them.</param>
    /// <returns>The concatenation of all fragments.</returns>
    private static string expression(params string[] segments) => string.Concat(segments);

    /// <summary>
    /// optional wraps the expression in a non-capturing group and makes the
    /// production optional.
    /// </summary>
    private static string optional(params string[] segments) => group(segments) + "?";

    /// <summary>
    /// repeated wraps the regexp in a non-capturing group to get one or more
    /// matches.
    /// </summary>
    private static string repeated(params string[] segments) => group(segments) + "+";

    /// <summary>
    /// group wraps the regexp in a non-capturing group.
    /// </summary>
    private static string group(params string[] segments) => $"(?:{expression(segments)})";

    /// <summary>
    /// capture wraps the expression in a capturing group.
    /// </summary>
    private static string capture(params string[] segments) => $"({expression(segments)})";

    /// <summary>
    /// anchored anchors the regular expression by adding start and end delimiters.
    /// </summary>
    /// <remarks>
    /// The end delimiter is <c>\z</c> rather than <c>$</c>: in .NET, <c>$</c> also matches just before a final
    /// newline, which would let a value with a trailing "\n" pass validation. The Go source relies on <c>$</c>
    /// matching only at the very end of the text, which is what <c>\z</c> does here.
    /// </remarks>
    private static string anchored(params string[] segments) => "^" + expression(segments) + @"\z";
}