File: src\Shared\HttpRuleParser.cs
Web Access
Project: src\src\Http\Headers\src\Microsoft.Net.Http.Headers.csproj (Microsoft.Net.Http.Headers)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Diagnostics.Contracts;
using System.Globalization;
using System.Text;
using Microsoft.Extensions.Primitives;
 
namespace Microsoft.Net.Http.Headers;
 
internal static class HttpRuleParser
{
    // token = 1*<any CHAR except CTLs or separators>
    // CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
    private static readonly SearchValues<char> TokenChars =
        SearchValues.Create("!#$%&'*+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz|~");
 
    private const int MaxNestedCount = 5;
    private static readonly string[] DateFormats = new string[]
    {
        // "r", // RFC 1123, required output format but too strict for input
        "ddd, d MMM yyyy H:m:s 'GMT'", // RFC 1123 (r, except it allows both 1 and 01 for date and time)
        "ddd, d MMM yyyy H:m:s", // RFC 1123, no zone - assume GMT
        "d MMM yyyy H:m:s 'GMT'", // RFC 1123, no day-of-week
        "d MMM yyyy H:m:s", // RFC 1123, no day-of-week, no zone
        "ddd, d MMM yy H:m:s 'GMT'", // RFC 1123, short year
        "ddd, d MMM yy H:m:s", // RFC 1123, short year, no zone
        "d MMM yy H:m:s 'GMT'", // RFC 1123, no day-of-week, short year
        "d MMM yy H:m:s", // RFC 1123, no day-of-week, short year, no zone
 
        "dddd, d'-'MMM'-'yy H:m:s 'GMT'", // RFC 850, short year
        "dddd, d'-'MMM'-'yy H:m:s", // RFC 850 no zone
        "ddd, d'-'MMM'-'yyyy H:m:s 'GMT'", // RFC 850, long year
        "ddd MMM d H:m:s yyyy", // ANSI C's asctime() format
 
        "ddd, d MMM yyyy H:m:s zzz", // RFC 5322
        "ddd, d MMM yyyy H:m:s", // RFC 5322 no zone
        "d MMM yyyy H:m:s zzz", // RFC 5322 no day-of-week
        "d MMM yyyy H:m:s", // RFC 5322 no day-of-week, no zone
    };
 
    internal const char CR = '\r';
    internal const char LF = '\n';
    internal const char SP = ' ';
    internal const char Tab = '\t';
    internal const int MaxInt64Digits = 19;
    internal const int MaxInt32Digits = 10;
 
    // iso-8859-1, Western European (ISO)
    internal static readonly Encoding DefaultHttpEncoding = Encoding.GetEncoding("iso-8859-1");
 
    [Pure]
    internal static int GetTokenLength(StringSegment input, int startIndex)
    {
        Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
 
        if (startIndex >= input.Length)
        {
            return 0;
        }
 
        var subspan = input.AsSpan(startIndex);
        var firstNonTokenCharIdx = subspan.IndexOfAnyExcept(TokenChars);
        return (firstNonTokenCharIdx == -1) ? subspan.Length : firstNonTokenCharIdx;
    }
 
    internal static int GetWhitespaceLength(StringSegment input, int startIndex)
    {
        Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
 
        if (startIndex >= input.Length)
        {
            return 0;
        }
 
        var current = startIndex;
 
        char c;
        while (current < input.Length)
        {
            c = input[current];
 
            if ((c == SP) || (c == Tab))
            {
                current++;
                continue;
            }
 
            if (c == CR)
            {
                // If we have a #13 char, it must be followed by #10 and then at least one SP or HT.
                if ((current + 2 < input.Length) && (input[current + 1] == LF))
                {
                    var spaceOrTab = input[current + 2];
                    if ((spaceOrTab == SP) || (spaceOrTab == Tab))
                    {
                        current += 3;
                        continue;
                    }
                }
            }
 
            return current - startIndex;
        }
 
        // All characters between startIndex and the end of the string are LWS characters.
        return input.Length - startIndex;
    }
 
    internal static int GetNumberLength(StringSegment input, int startIndex, bool allowDecimal)
    {
        Contract.Requires((startIndex >= 0) && (startIndex < input.Length));
        Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
 
        var current = startIndex;
        char c;
 
        // If decimal values are not allowed, we pretend to have read the '.' character already. I.e. if a dot is
        // found in the string, parsing will be aborted.
        var haveDot = !allowDecimal;
 
        // The RFC doesn't allow decimal values starting with dot. I.e. value ".123" is invalid. It must be in the
        // form "0.123". Also, there are no negative values defined in the RFC. So we'll just parse non-negative
        // values.
        // The RFC only allows decimal dots not ',' characters as decimal separators. Therefore value "1,23" is
        // considered invalid and must be represented as "1.23".
        if (input[current] == '.')
        {
            return 0;
        }
 
        while (current < input.Length)
        {
            c = input[current];
            if ((c >= '0') && (c <= '9'))
            {
                current++;
            }
            else if (!haveDot && (c == '.'))
            {
                // Note that value "1." is valid.
                haveDot = true;
                current++;
            }
            else
            {
                break;
            }
        }
 
        return current - startIndex;
    }
 
    internal static HttpParseResult GetQuotedStringLength(StringSegment input, int startIndex, out int length)
    {
        var nestedCount = 0;
        return GetExpressionLength(input, startIndex, '"', '"', false, ref nestedCount, out length);
    }
 
    // quoted-pair = "\" CHAR
    // CHAR = <any US-ASCII character (octets 0 - 127)>
    internal static HttpParseResult GetQuotedPairLength(StringSegment input, int startIndex, out int length)
    {
        Contract.Requires((startIndex >= 0) && (startIndex < input.Length));
        length = 0;
 
        if (input[startIndex] != '\\')
        {
            return HttpParseResult.NotParsed;
        }
 
        // Quoted-char has 2 characters. Check whether there are 2 chars left ('\' + char)
        // If so, check whether the character is in the range 0-127. If not, it's an invalid value.
        if ((startIndex + 2 > input.Length) || (input[startIndex + 1] > 127))
        {
            return HttpParseResult.InvalidFormat;
        }
 
        // We don't care what the char next to '\' is.
        length = 2;
        return HttpParseResult.Parsed;
    }
 
    // Try the various date formats in the order listed above.
    // We should accept a wide verity of common formats, but only output RFC 1123 style dates.
    internal static bool TryStringToDate(StringSegment input, out DateTimeOffset result)
    {
        ReadOnlySpan<char> inputSpan = input.AsSpan();
        DateTimeFormatInfo invariant = DateTimeFormatInfo.InvariantInfo;
 
        // RFC 1123 is most common and has a fast path in TryParseExact.  Try it first, and only if that
        // fails fall back to trying the rest of the formats.
        return
            DateTimeOffset.TryParseExact(inputSpan, "r", invariant, DateTimeStyles.None, out result) ||
            DateTimeOffset.TryParseExact(inputSpan, DateFormats, invariant, DateTimeStyles.AllowWhiteSpaces | DateTimeStyles.AssumeUniversal, out result);
    }
 
    // TEXT = <any OCTET except CTLs, but including LWS>
    // LWS = [CRLF] 1*( SP | HT )
    // CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
    //
    // Since we don't really care about the content of a quoted string or comment, we're more tolerant and
    // allow these characters. We only want to find the delimiters ('"' for quoted string and '(', ')' for comment).
    //
    // 'nestedCount': Comments can be nested. We allow a depth of up to 5 nested comments, i.e. something like
    // "(((((comment)))))". If we wouldn't define a limit an attacker could send a comment with hundreds of nested
    // comments, resulting in a stack overflow exception. In addition having more than 1 nested comment (if any)
    // is unusual.
    private static HttpParseResult GetExpressionLength(
        StringSegment input,
        int startIndex,
        char openChar,
        char closeChar,
        bool supportsNesting,
        ref int nestedCount,
        out int length)
    {
        Contract.Requires(input != null);
        Contract.Requires((startIndex >= 0) && (startIndex < input.Length));
 
        length = 0;
 
        if (input[startIndex] != openChar)
        {
            return HttpParseResult.NotParsed;
        }
 
        var current = startIndex + 1; // Start parsing with the character next to the first open-char
        while (current < input.Length)
        {
            // Only check whether we have a quoted char, if we have at least 3 characters left to read (i.e.
            // quoted char + closing char). Otherwise the closing char may be considered part of the quoted char.
            if ((current + 2 < input.Length) &&
                (GetQuotedPairLength(input, current, out var quotedPairLength) == HttpParseResult.Parsed))
            {
                // We ignore invalid quoted-pairs. Invalid quoted-pairs may mean that it looked like a quoted pair,
                // but we actually have a quoted-string: e.g. "\ü" ('\' followed by a char >127 - quoted-pair only
                // allows ASCII chars after '\'; qdtext allows both '\' and >127 chars).
                current = current + quotedPairLength;
                continue;
            }
 
            // If we support nested expressions and we find an open-char, then parse the nested expressions.
            if (supportsNesting && (input[current] == openChar))
            {
                nestedCount++;
                try
                {
                    // Check if we exceeded the number of nested calls.
                    if (nestedCount > MaxNestedCount)
                    {
                        return HttpParseResult.InvalidFormat;
                    }
 
                    var nestedLength = 0;
                    var nestedResult = GetExpressionLength(input, current, openChar, closeChar,
                        supportsNesting, ref nestedCount, out nestedLength);
 
                    switch (nestedResult)
                    {
                        case HttpParseResult.Parsed:
                            current += nestedLength; // add the length of the nested expression and continue.
                            break;
 
                        case HttpParseResult.NotParsed:
                            Contract.Assert(false, "'NotParsed' is unexpected: We started nested expression " +
                                "parsing, because we found the open-char. So either it's a valid nested " +
                                "expression or it has invalid format.");
                            break;
 
                        case HttpParseResult.InvalidFormat:
                            // If the nested expression is invalid, we can't continue, so we fail with invalid format.
                            return HttpParseResult.InvalidFormat;
 
                        default:
                            Contract.Assert(false, "Unknown enum result: " + nestedResult);
                            break;
                    }
                }
                finally
                {
                    nestedCount--;
                }
            }
 
            if (input[current] == closeChar)
            {
                length = current - startIndex + 1;
                return HttpParseResult.Parsed;
            }
            current++;
        }
 
        // We didn't see the final quote, therefore we have an invalid expression string.
        return HttpParseResult.InvalidFormat;
    }
}