// File: ReferenceParser.cs
// Web Access
// Project: ..\..\..\src\Containers\Microsoft.NET.Build.Containers\Microsoft.NET.Build.Containers.csproj (Microsoft.NET.Build.Containers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Text.RegularExpressions;
 
namespace Microsoft.NET.Build.Containers;
 
/// <summary>
/// This class is a port of the regex patterns described in the regexp.go file in the OCI Registry spec repo distribution/distribution.
/// It is based on SHA 78b9c98c5c31c30d74f9acb7d96f98552f2cf78f.
/// <see href="https://github.com/distribution/distribution/blob/78b9c98c5c31c30d74f9acb7d96f98552f2cf78f/reference/regexp.go">regexp.go</see> is the direct file link.
/// Comments on each member are lifted from this source wherever they still describe the .NET pattern accurately.
/// Names of each member are deliberately non-.NET-standard, as they were kept aligned with their golang versions for easier comparison.
/// Visibility of each member is determined by golang rules - lowercase is private, uppercase is public. The exception is when a private member is used inside the golang module.
/// </summary>
/// <remarks>
/// Go's regexp engine and .NET's <see cref="Regex"/> do not interpret every construct identically, so three
/// members intentionally differ from a character-for-character copy of the Go patterns:
/// <c>separator</c> never matches the empty string (avoids catastrophic backtracking),
/// <c>tag</c> spells out the ASCII word characters instead of using <c>\w</c>, and
/// <c>anchored</c> ends with <c>\z</c> instead of <c>$</c>. Each member documents its own reason.
/// </remarks>
internal static class ReferenceParser
{

    /// <summary>
    /// alphaNumeric defines the alpha numeric atom, typically a
    /// component of names. This only allows lower case characters and digits.
    /// </summary>
    private static readonly string alphaNumeric = @"[a-z0-9]+";

    /// <summary>
    /// separator defines the separators allowed to be embedded in name
    /// components. This allows one period, one or two underscore and multiple
    /// dashes. Repeated dashes and underscores are intentionally treated
    /// differently. In order to support valid hostnames as name components,
    /// supporting repeated dash was added. Additionally double underscore is
    /// now allowed as a separator to loosen the restriction for previously
    /// supported names.
    /// </summary>
    /// <remarks>
    /// The dash alternative is <c>[-]+</c> rather than <c>[-]*</c>. An empty separator does not change which
    /// strings match (two adjacent alphaNumeric runs are just one longer run), but it lets the engine split a
    /// run of letters in exponentially many ways. Go's engine runs in linear time regardless; .NET's
    /// backtracking engine does not, so a long non-matching input could stall it.
    /// </remarks>
    private static readonly string separator = @"(?:[._]|__|[-]+)";

    /// <summary>
    /// nameComponent restricts registry path component names to start
    /// with at least one letter or number, with following parts able to be
    /// separated by one period, one or two underscore and multiple dashes.
    /// </summary>
    private static readonly string nameComponent = expression(alphaNumeric, optional(repeated(separator, alphaNumeric)));

    /// <summary>
    /// domainNameComponent restricts the registry domain component of a
    /// repository name to start with a component as defined by DomainRegexp.
    /// </summary>
    private static readonly string domainNameComponent = @"(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])";

    /// <summary>
    /// ipv6address are enclosed between square brackets and may be represented
    /// in many ways, see rfc5952. Only IPv6 in compressed or uncompressed format
    /// are allowed, IPv6 zone identifiers (rfc6874) or Special addresses such as
    /// IPv4-Mapped are deliberately excluded.
    /// </summary>
    private static readonly string ipv6address = expression(
        literal("["),
        @"(?:[a-fA-F0-9:]+)",
        literal("]")
    );

    /// <summary>
    /// domainName defines the structure of potential domain components
    /// that may be part of image names. This is purposely a subset of what is
    /// allowed by DNS to ensure backwards compatibility with Docker image
    /// names. This includes IPv4 addresses on decimal format.
    /// </summary>
    private static readonly string domainName = expression(
        domainNameComponent,
        // Require two name domainNameComponents to prevent matching a domain when parsing the short containername: 'ubuntu/runtime'.
        literal("."), domainNameComponent,
        optional(repeated(literal("."), domainNameComponent))
    );

    /// <summary>
    /// host defines the structure of potential domains based on the URI
    /// Host subcomponent on rfc3986. It may be a subset of DNS domain name,
    /// or an IPv4 address in decimal format, or an IPv6 address between square
    /// brackets (excluding zone identifiers as defined by rfc6874 or special
    /// addresses such as IPv4-Mapped).
    /// </summary>
    private static readonly string host = $"(?:{domainName}|{ipv6address}|localhost)";

    /// <summary>
    /// domain defines the structure of potential domain components that may be
    /// part of image names: a host, optionally followed by a colon and a decimal
    /// port number. This is purposely a subset of what is
    /// allowed by the URI Host subcomponent on rfc3986 to ensure backwards
    /// compatibility with Docker image names.
    /// </summary>
    private static readonly string domain = expression(
        host,
        optional(literal(":"), "[0-9]+")
    );

    /// <summary>
    /// DomainRegexp defines the structure of potential domain components
    /// that may be part of image names. This is purposely a subset of what is
    /// allowed by DNS to ensure backwards compatibility with Docker image
    /// names.
    /// </summary>
    public static readonly Regex DomainRegexp = new(domain);

    /// <summary>
    /// This is a custom addition - we needed domain-part validation for a string we _knew_ was anchored,
    /// so this was included as a slight addition to the source material.
    /// </summary>
    public static readonly Regex AnchoredDomainRegexp = new(anchored(domain));

    /// <summary>
    /// valid tags are an ASCII letter, digit, or underscore followed by 0-127 subsequent
    /// ASCII letters, digits, underscores, dots, or dashes.
    /// </summary>
    /// <remarks>
    /// The golang source writes this with <c>\w</c>, which in Go is ASCII-only. In .NET <c>\w</c> also matches
    /// Unicode letters, marks, digits and connector punctuation, so the class is spelled out explicitly.
    /// </remarks>
    private static readonly string tag = @"[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}";

    /// <summary>
    /// TagRegexp matches valid tag names. From docker/docker:graph/tags.go.
    /// </summary>
    public static readonly Regex TagRegexp = new(tag);

    /// <summary>
    /// anchoredTag matches valid tag names, anchored at the start and
    /// end of the matched string.
    /// </summary>
    private static readonly string anchoredTag = anchored(tag);

    /// <summary>
    /// anchoredTagRegexp matches valid tag names, anchored at the start and
    /// end of the matched string.
    /// </summary>
    public static readonly Regex anchoredTagRegexp = new(anchoredTag);

    /// <summary>
    /// needed because the original golang used `[[:xdigit:]]` which .Net doesn't support.
    /// `[[:xdigit:]]` is the set of valid hex digits, which is 0-9, a-f, and A-F.
    /// </summary>
    private static readonly string hexDigit = "[0-9A-Fa-f]";

    /// <summary>
    /// digestPat matches valid digests.
    /// </summary>
    private static readonly string digestPat = $"[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:]{hexDigit}{{32,}}";

    /// <summary>
    /// DigestRegexp matches valid digests.
    /// </summary>
    public static readonly Regex DigestRegexp = new(digestPat);

    /// <summary>
    /// anchoredDigest matches valid digests, anchored at the start and
    /// end of the matched string.
    /// </summary>
    private static readonly string anchoredDigest = anchored(digestPat);

    /// <summary>
    /// anchoredDigestRegexp matches valid digests, anchored at the start and
    /// end of the matched string.
    /// </summary>
    private static readonly Regex anchoredDigestRegexp = new(anchoredDigest);

    /// <summary>
    /// namePat is the format for the name component of references: an optional
    /// domain followed by a forward slash, then one or more /-delimited name components.
    /// It is built from non-capturing groups only; see anchoredName for the variant
    /// that captures the domain and the remaining path separately.
    /// </summary>
    private static readonly string namePat = expression(
        optional(domain, literal("/")),
        nameComponent,
        optional(repeated(literal("/"), nameComponent))
    );

    /// <summary>
    /// NameRegexp is the format for the name component of references. Logically this consists of
    /// a domain portion followed by one or more /-delimited name components.
    /// </summary>
    public static readonly Regex NameRegexp = new(namePat);

    /// <summary>
    /// anchoredName is the pattern used to parse a name value, capturing the
    /// domain (group 1, absent when no domain is present) and the trailing
    /// components (group 2), omitting the separating forward slash from either.
    /// </summary>
    private static readonly string anchoredName = anchored(
        optional(capture(domain), literal("/")),
        capture(nameComponent, optional(repeated(literal("/"), nameComponent)))
    );

    /// <summary>
    /// anchoredNameRegexp is used to parse a name value, capturing the
    /// domain and trailing components.
    /// </summary>
    public static readonly Regex anchoredNameRegexp = new(anchoredName);

    /// <summary>
    /// referencePat is the full supported format of a reference. The regexp
    /// is anchored and has capturing groups for name, tag, and digest
    /// components.
    /// </summary>
    private static readonly string referencePat = anchored(
        capture(namePat),
        optional(literal(":"), capture(tag)),
        optional(literal("@"), capture(digestPat))
    );

    /// <summary>
    /// ReferenceRegexp is the full supported format of a reference. The regexp
    /// is anchored and has capturing groups for name, tag, and digest
    /// components.
    /// </summary>
    public static readonly Regex ReferenceRegexp = new(referencePat);

    /// <summary>
    /// identifier is the format for string identifier used as a
    /// content addressable identifier using sha256. These identifiers
    /// are like digests without the algorithm, since sha256 is used.
    /// </summary>
    private static readonly string identifier = @"([a-f0-9]{64})";

    /// <summary>
    /// IdentifierRegexp is the format for string identifier used as a
    /// content addressable identifier using sha256. These identifiers
    /// are like digests without the algorithm, since sha256 is used.
    /// </summary>
    public static readonly Regex IdentifierRegexp = new(identifier);

    /// <summary>
    /// shortIdentifier is the format used to represent a prefix
    /// of an identifier. A prefix may be used to match a sha256 identifier
    /// within a list of trusted identifiers.
    /// </summary>
    private static readonly string shortIdentifier = @"([a-f0-9]{6,64})";

    /// <summary>
    /// ShortIdentifierRegexp is the format used to represent a prefix
    /// of an identifier. A prefix may be used to match a sha256 identifier
    /// within a list of trusted identifiers.
    /// </summary>
    public static readonly Regex ShortIdentifierRegexp = new(shortIdentifier);

    /// <summary>
    /// anchoredIdentifier is used to check or match an
    /// identifier value, anchored at start and end of string.
    /// </summary>
    private static readonly string anchoredIdentifier = anchored(identifier);

    /// <summary>
    /// anchoredIdentifierRegexp is used to check or match an
    /// identifier value, anchored at start and end of string.
    /// </summary>
    private static readonly Regex anchoredIdentifierRegexp = new(anchoredIdentifier);

    /// <summary>
    /// anchoredShortIdentifier is used to check if a value
    /// is a possible identifier prefix, anchored at start and end
    /// of string.
    /// </summary>
    private static readonly string anchoredShortIdentifier = anchored(shortIdentifier);

    /// <summary>
    /// anchoredShortIdentifierRegexp is used to check if a value
    /// is a possible identifier prefix, anchored at start and end
    /// of string.
    /// </summary>
    private static readonly Regex anchoredShortIdentifierRegexp = new(anchoredShortIdentifier);

    /// <summary>
    /// literal compiles s into a literal regular expression, escaping any regexp
    /// reserved characters.
    /// </summary>
    /// <remarks>we use a simpler implementation than the golang source since Regex.Escape seems to do the job</remarks>
    private static string literal(string s) => Regex.Escape(s);

    /// <summary>
    /// expression defines a full expression, where each regular expression must
    /// follow the previous.
    /// </summary>
    /// <param name="segments">The sub-patterns to join, in order.</param>
    /// <returns>The segments concatenated with nothing in between.</returns>
    private static string expression(params string[] segments) => string.Concat(segments);

    /// <summary>
    /// optional wraps the expression in a non-capturing group and makes the
    /// production optional.
    /// </summary>
    private static string optional(params string[] segments) => $"{group(expression(segments))}?";

    /// <summary>
    /// repeated wraps the regexp in a non-capturing group to get one or more
    /// matches.
    /// </summary>
    private static string repeated(params string[] segments) => $"{group(expression(segments))}+";

    /// <summary>
    /// group wraps the regexp in a non-capturing group.
    /// </summary>
    private static string group(params string[] segments) => $"(?:{expression(segments)})";

    /// <summary>
    /// capture wraps the expression in a capturing group.
    /// </summary>
    private static string capture(params string[] segments) => $"({expression(segments)})";

    /// <summary>
    /// anchored anchors the regular expression by adding start and end delimiters.
    /// </summary>
    /// <remarks>
    /// The end delimiter is <c>\z</c>, not <c>$</c>. In Go (without the multi-line flag) <c>$</c> matches only
    /// at the very end of the text, but in .NET <c>$</c> also matches just before a trailing newline, which
    /// would let a value such as "latest\n" pass validation.
    /// </remarks>
    private static string anchored(params string[] segments) => "^" + expression(segments) + @"\z";
}