RegexParser.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;

namespace System.Text.RegularExpressions
{
    /// <summary>Builds a tree of RegexNodes from a regular expression.</summary>
    internal ref struct RegexParser
    {
        // Implementation notes:
        // It would be nice to get rid of the comment modes, since the
        // ScanBlank() calls are just kind of duct-taped in.

        // Size (in chars) of the stack buffer used by EscapeImpl and UnescapeImpl for small inputs.
        private const int EscapeMaxBufferSize = 256;
        // Number of ints stackalloc'd as the initial storage for _optionsStack.
        private const int OptionStackDefaultSize = 32;
        // int.MaxValue split into quotient and remainder by 10.
        // NOTE(review): presumably used for overflow checks while scanning decimal numbers; the user is not visible here.
        private const int MaxValueDiv10 = int.MaxValue / 10;
        private const int MaxValueMod10 = int.MaxValue % 10;

        // Tree-building state. ScanRegex treats a non-null _stack as "there is an unclosed '('".
        // NOTE(review): _group and _alternation are maintained by helpers (StartGroup, AddAlternate, ...) outside this view.
        private RegexNode? _stack;
        private RegexNode? _group;
        private RegexNode? _alternation;
        // Node to which finished units are appended as children.
        private RegexNode? _concatenation;
        // Most recently parsed node, held until ScanRegex knows whether a quantifier follows it.
        private RegexNode? _unit;

        // The text being parsed and the index of the next character to read from it.
        private readonly string _pattern;
        private int _pos;
        // Culture handed to case-conversion helpers.
        private readonly CultureInfo _culture;
        private RegexCaseBehavior _caseBehavior;
        // When true, Parse passes the culture on to the resulting RegexTree.
        private bool _hasIgnoreCaseBackreferenceNodes;

        // Number assigned to the next unnamed capture group (Reset sets it to 1, reserving 0 for the root capture).
        private int _autocap;
        private int _capcount;
        // Read by Parse as the capture count when the capture numbers are not sparse.
        private int _captop;
        private readonly int _capsize;

        // Capture-number table; Parse rewrites its values to slot indices when the numbering is sparse.
        private readonly Hashtable _caps;
        private Hashtable? _capnames;

        // Capture numbers and capture names collected for the pattern (null until populated).
        private int[]? _capnumlist;
        private List<string>? _capnamelist;

        // Options currently in effect; saved on _optionsStack at '(' and restored at ')'.
        private RegexOptions _options;
        // NOTE: _optionsStack is ValueListBuilder<int> to ensure that
        //       ArrayPool<int>.Shared, not ArrayPool<RegexOptions>.Shared,
        //       will be created if the stackalloc'd capacity is ever exceeded.
        private ValueListBuilder<int> _optionsStack;

        private bool _ignoreNextParen; // flag to skip capturing a parentheses group

        /// <summary>Creates a parser over <paramref name="pattern"/> with all scanning state cleared.</summary>
        /// <param name="pattern">The text to parse; must not be null.</param>
        /// <param name="options">The options in effect at the start of the pattern.</param>
        /// <param name="culture">The culture used for case conversions; must not be null.</param>
        /// <param name="caps">The capture-number table to use.</param>
        /// <param name="capsize">The capture size associated with <paramref name="caps"/>.</param>
        /// <param name="capnames">The capture-name table, or null.</param>
        /// <param name="optionSpan">Initial backing storage for the options stack.</param>
        private RegexParser(string pattern, RegexOptions options, CultureInfo culture, Hashtable caps, int capsize, Hashtable? capnames, Span<int> optionSpan)
        {
            Debug.Assert(pattern != null, "Pattern must be set");
            Debug.Assert(culture != null, "Culture must be set");

            // Caller-supplied inputs.
            _pattern = pattern;
            _culture = culture;
            _options = options;
            _optionsStack = new ValueListBuilder<int>(optionSpan);

            // Capture bookkeeping.
            _caps = caps;
            _capsize = capsize;
            _capnames = capnames;
            _capnumlist = null;
            _capnamelist = null;
            _autocap = 0;
            _capcount = 0;
            _captop = 0;

            // Scanner position and flags.
            _pos = 0;
            _ignoreNextParen = false;
            _caseBehavior = default;
            _hasIgnoreCaseBackreferenceNodes = false;

            // Tree under construction.
            _unit = null;
            _concatenation = null;
            _alternation = null;
            _group = null;
            _stack = null;
        }

        /// <summary>Selects the culture the parser should use for the given options.</summary>
        /// <param name="options">The options supplied for the regular expression.</param>
        /// <returns>
        /// <see cref="CultureInfo.InvariantCulture"/> when <see cref="RegexOptions.CultureInvariant"/> is set;
        /// otherwise <see cref="CultureInfo.CurrentCulture"/>.
        /// </returns>
        internal static CultureInfo GetTargetCulture(RegexOptions options)
        {
            if ((options & RegexOptions.CultureInvariant) == 0)
            {
#pragma warning disable RS1035 // The symbol 'CultureInfo.CurrentCulture' is banned for use by analyzers.
                return CultureInfo.CurrentCulture;
#pragma warning restore RS1035
            }

            return CultureInfo.InvariantCulture;
        }

        /// <summary>Returns the options that the capture-counting pre-pass reports for <paramref name="pattern"/>.</summary>
        /// <param name="pattern">The pattern to scan.</param>
        /// <param name="options">The options in effect at the start of the pattern.</param>
        /// <returns>The value produced through the <c>out</c> parameter of <c>CountCaptures</c>.</returns>
        public static unsafe RegexOptions ParseOptionsInPattern(string pattern, RegexOptions options)
        {
            Span<int> optionSpan = stackalloc int[OptionStackDefaultSize];

            // No case conversions are performed on this path, so the culture is irrelevant;
            // the invariant culture is passed simply to satisfy the constructor.
            using var parser = new RegexParser(pattern, options, CultureInfo.InvariantCulture, new Hashtable(), 0, null, optionSpan);

            // Counting captures is not the goal here, but that pass already walks the whole
            // pattern and hands back the inline options it encountered, which is what we want.
            parser.CountCaptures(out RegexOptions inlineOptions);
            parser.Reset(options);

            return inlineOptions;
        }

        /// <summary>Parses <paramref name="pattern"/> into a <c>RegexTree</c>.</summary>
        /// <param name="pattern">The regular expression text.</param>
        /// <param name="options">The options in effect at the start of the pattern.</param>
        /// <param name="culture">The culture used for case conversions.</param>
        /// <returns>The tree together with its capture metadata.</returns>
        public static unsafe RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture)
        {
            using var parser = new RegexParser(pattern, options, culture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]);

            // First pass collects capture information; the parser is then rewound for the real scan.
            parser.CountCaptures(out _);
            parser.Reset(options);
            RegexNode root = parser.ScanRegex();

            int[]? numbers = parser._capnumlist;
            int top = parser._captop;

            Hashtable? numberToSlot;
            int captureCount;
            if (numbers is not null && numbers.Length != top)
            {
                // Sparse capture numbers: the count is the length of the number list, and each
                // capture number is mapped to its position within that list.
                numberToSlot = parser._caps;
                captureCount = numbers.Length;

                int slot = 0;
                foreach (int number in numbers)
                {
                    numberToSlot[number] = slot;
                    slot++;
                }
            }
            else
            {
                // Dense capture numbers: no mapping is needed.
                numberToSlot = null;
                captureCount = top;
            }

            return new RegexTree(root, captureCount, parser._capnamelist?.ToArray(), parser._capnames!, numberToSlot, options, parser._hasIgnoreCaseBackreferenceNodes ? culture : null);
        }

        /// <summary>Builds a <c>RegexReplacement</c> from a replacement pattern by scanning it into a flat concatenation node.</summary>
        /// <param name="pattern">The replacement pattern.</param>
        /// <param name="options">The options of the owning regular expression.</param>
        /// <param name="caps">The capture-number table of the owning regular expression.</param>
        /// <param name="capsize">The capture size of the owning regular expression.</param>
        /// <param name="capnames">The capture-name table of the owning regular expression.</param>
        public static unsafe RegexReplacement ParseReplacement(string pattern, RegexOptions options, Hashtable caps, int capsize, Hashtable capnames)
        {
            CultureInfo culture;
            if ((options & RegexOptions.CultureInvariant) != 0)
            {
                culture = CultureInfo.InvariantCulture;
            }
            else
            {
#pragma warning disable RS1035 // The symbol 'CultureInfo.CurrentCulture' is banned for use by analyzers.
                culture = CultureInfo.CurrentCulture;
#pragma warning restore RS1035
            }

            using var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, stackalloc int[OptionStackDefaultSize]);

            return new RegexReplacement(pattern, parser.ScanReplacement(), caps);
        }

        /// <summary>Escapes all metacharacters (including |,(,),[,{,^,$,*,+,?,\, spaces and #)</summary>
        /// <param name="input">The text to escape.</param>
        /// <returns>
        /// <paramref name="input"/> itself when it contains no metacharacter; otherwise a new string
        /// in which every metacharacter is preceded by a backslash.
        /// </returns>
        public static string Escape(string input)
        {
            int firstMetachar = input.AsSpan().IndexOfAny(s_metachars);
            if (firstMetachar < 0)
            {
                // Nothing to escape: hand back the same instance without allocating.
                return input;
            }

            return EscapeImpl(input.AsSpan(), firstMetachar);
        }

        /// <summary>Produces the escaped form of <paramref name="input"/>.</summary>
        /// <param name="input">The text to escape.</param>
        /// <param name="indexOfMetachar">Index of the first metacharacter in <paramref name="input"/>.</param>
        /// <returns>The escaped string.</returns>
        private static unsafe string EscapeImpl(ReadOnlySpan<char> input, int indexOfMetachar)
        {
            // Small inputs are built in a stack buffer; a buffer three times the size of the
            // input is normally enough because rarely does every character need escaping.
            // Larger inputs start from a builder sized to the input length plus a fixed,
            // conservative number of extra chars.
            ValueStringBuilder vsb = input.Length <= (EscapeMaxBufferSize / 3) ?
                new ValueStringBuilder(stackalloc char[EscapeMaxBufferSize]) :
                new ValueStringBuilder(input.Length + 200);

            ReadOnlySpan<char> remaining = input;
            int literalLength = indexOfMetachar;

            for (; ; )
            {
                // Copy the run of ordinary characters that precedes the next metacharacter.
                vsb.Append(remaining.Slice(0, literalLength));
                remaining = remaining.Slice(literalLength);
                if (remaining.IsEmpty)
                {
                    break;
                }

                // Control characters are written using their letter form (e.g. a newline becomes \n).
                char escaped = remaining[0] switch
                {
                    '\n' => 'n',
                    '\r' => 'r',
                    '\t' => 't',
                    '\f' => 'f',
                    char other => other,
                };

                vsb.Append('\\');
                vsb.Append(escaped);
                remaining = remaining.Slice(1);

                // With no further metacharacter, the whole tail is literal and the next pass ends the loop.
                int next = remaining.IndexOfAny(s_metachars);
                literalLength = next < 0 ? remaining.Length : next;
            }

            return vsb.ToString();
        }

        /// <summary>Unescapes all metacharacters (including (,),[,],{,},|,^,$,*,+,?,\, spaces and #)</summary>
        /// <param name="input">The text containing escapes.</param>
        /// <returns>
        /// <paramref name="input"/> itself when it contains no backslash; otherwise a new string with
        /// each escape sequence replaced by the character it denotes.
        /// </returns>
        public static string Unescape(string input)
        {
            int firstBackslash = input.IndexOf('\\');
            if (firstBackslash < 0)
            {
                // No escapes present: return the same instance.
                return input;
            }

            return UnescapeImpl(input, firstBackslash);
        }

        /// <summary>Produces the unescaped form of <paramref name="input"/>.</summary>
        /// <param name="input">The text containing escapes.</param>
        /// <param name="i">Index of the first backslash in <paramref name="input"/>.</param>
        /// <returns>The unescaped string.</returns>
        private static unsafe string UnescapeImpl(string input, int i)
        {
            var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]);

            // The result is never longer than the input, so the input length bounds the buffer.
            // Small inputs are built in a stack buffer.
            ValueStringBuilder vsb = input.Length <= EscapeMaxBufferSize ?
                new ValueStringBuilder(stackalloc char[EscapeMaxBufferSize]) :
                new ValueStringBuilder(input.Length);

            // Everything before the first backslash is copied verbatim.
            vsb.Append(input.AsSpan(0, i));

            int backslash = i;
            while (true)
            {
                // Position the parser just past the backslash and let it decode the escape,
                // unless the backslash was the very last character.
                parser._pos = backslash + 1;
                if (parser._pos < input.Length)
                {
                    vsb.Append(parser.ScanCharEscape());
                }

                // Copy the literal run up to the next backslash (or the end of the input).
                int literalStart = parser._pos;
                int nextBackslash = input.IndexOf('\\', literalStart);
                int literalEnd = nextBackslash < 0 ? input.Length : nextBackslash;
                vsb.Append(input.AsSpan(literalStart, literalEnd - literalStart));

                if (literalEnd >= input.Length)
                {
                    break;
                }

                backslash = literalEnd;
            }

            parser.Dispose();

            return vsb.ToString();
        }

        /// <summary>Rewinds the parser to the start of the pattern so it can be scanned again.</summary>
        /// <param name="options">The options to make current for the new scan.</param>
        private void Reset(RegexOptions options)
        {
            // Restore the option state: nothing saved, caller-supplied options active.
            _options = options;
            _optionsStack.Length = 0;

            // Restart scanning from the first character with no open groups.
            _stack = null;
            _pos = 0;
            _ignoreNextParen = false;

            // Capture number 0 belongs to the root capture, so automatic numbering starts at 1.
            _autocap = 1;
        }

        /// <summary>Releases the storage held by the options stack.</summary>
        public void Dispose()
        {
            _optionsStack.Dispose();
        }

        /// <summary>The main parsing function: scans the pattern from the current position to its end and builds the node tree.</summary>
        /// <remarks>
        /// Each pass of the outer loop does three things: it consumes a run of ordinary characters and adds
        /// them to the current concatenation, it dispatches on the special character that ended the run to
        /// produce <c>_unit</c>, and it wraps <c>_unit</c> in a quantifier if one follows. Errors are reported
        /// by throwing the exception created by <c>MakeException</c> (unbalanced parentheses, a trailing
        /// backslash, a quantifier with nothing to quantify, or a reversed <c>{min,max}</c> range).
        /// </remarks>
        /// <returns>The result of <c>FinalOptimize</c> on the root node.</returns>
        private RegexNode ScanRegex()
        {
            char ch;
            bool isQuantifier = false;

            // For the main Capture object, strip out the IgnoreCase option. The rest of the nodes will strip it out depending on the content
            // of each node.
            StartGroup(new RegexNode(RegexNodeKind.Capture, (_options & ~RegexOptions.IgnoreCase), 0, -1));

            while (_pos < _pattern.Length)
            {
                // Remember whether the previous iteration ended on a quantifier; this only affects
                // which error is reported if another quantifier follows with nothing to apply to.
                bool wasPrevQuantifier = isQuantifier;
                isQuantifier = false;

                ScanBlank();

                int startpos = _pos;

                // move past all of the normal characters.  We'll stop when we hit some kind of control character,
                // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
                // A '{' that does not begin a real quantifier is treated as an ordinary character.
                if ((_options & RegexOptions.IgnorePatternWhitespace) != 0)
                {
                    while (_pos < _pattern.Length && (!IsSpecialOrSpace(ch = _pattern[_pos]) || (ch == '{' && !IsTrueQuantifier())))
                        _pos++;
                }
                else
                {
                    while (_pos < _pattern.Length && (!IsSpecial(ch = _pattern[_pos]) || (ch == '{' && !IsTrueQuantifier())))
                        _pos++;
                }

                int endpos = _pos;

                ScanBlank();

                // Classify what stopped the run. '!' and ' ' are used as sentinels because neither is a
                // case label for a real special character in the switch below.
                if (_pos == _pattern.Length)
                {
                    ch = '!'; // nonspecial, means at end
                }
                else if (IsSpecial(ch = _pattern[_pos]))
                {
                    isQuantifier = IsQuantifier(ch);
                    _pos++;
                }
                else
                {
                    ch = ' '; // nonspecial, means at ordinary char
                }

                if (startpos < endpos)
                {
                    // If a quantifier follows the run, it applies only to the run's last character,
                    // so that character is held back as _unit and the rest is added as literal text.
                    int cchUnquantified = endpos - startpos - (isQuantifier ? 1 : 0);

                    wasPrevQuantifier = false;

                    if (cchUnquantified > 0)
                    {
                        AddToConcatenate(startpos, cchUnquantified, false);
                    }

                    if (isQuantifier)
                    {
                        _unit = RegexNode.CreateOneWithCaseConversion(_pattern[endpos - 1], _options, _culture, ref _caseBehavior);
                    }
                }

                switch (ch)
                {
                    case '!':
                        goto BreakOuterScan;

                    case ' ':
                        goto ContinueOuterScan;

                    case '[':
                        {
                            string setString = ScanCharClass((_options & RegexOptions.IgnoreCase) != 0, scanOnly: false)!.ToStringClass();
                            _unit = new RegexNode(RegexNodeKind.Set, _options & ~RegexOptions.IgnoreCase, setString);
                        }
                        break;

                    case '(':
                        // Save the current options so the matching ')' can restore them. If ScanGroupOpen
                        // returns null no group was opened, so the saved entry is discarded again.
                        _optionsStack.Append((int)_options);
                        if (ScanGroupOpen() is RegexNode grouper)
                        {
                            PushGroup();
                            StartGroup(grouper);
                        }
                        else
                        {
                            _optionsStack.Length--;
                        }
                        continue;

                    case '|':
                        AddAlternate();
                        goto ContinueOuterScan;

                    case ')':
                        // A null _stack means there is no open group for this ')' to close.
                        if (_stack == null)
                        {
                            throw MakeException(RegexParseError.InsufficientOpeningParentheses, SR.InsufficientOpeningParentheses);
                        }

                        AddGroup();
                        PopGroup();
                        _options = (RegexOptions)_optionsStack.Pop();

                        // With no unit produced by the closed group there is nothing to quantify or append.
                        if (_unit == null)
                        {
                            goto ContinueOuterScan;
                        }
                        break;

                    case '\\':
                        if (_pos == _pattern.Length)
                        {
                            throw MakeException(RegexParseError.UnescapedEndingBackslash, SR.UnescapedEndingBackslash);
                        }

                        _unit = ScanBackslash(scanOnly: false)!;
                        break;

                    case '^':
                        // AnyNewLine combined with Multiline uses a dedicated node; otherwise Multiline
                        // selects Bol and its absence selects Beginning.
                        if ((_options & (RegexOptions.AnyNewLine | RegexOptions.Multiline)) == (RegexOptions.AnyNewLine | RegexOptions.Multiline))
                        {
                            _unit = AnyNewLineBolNode();
                        }
                        else
                        {
                            _unit = new RegexNode((_options & RegexOptions.Multiline) != 0 ? RegexNodeKind.Bol : RegexNodeKind.Beginning, _options);
                        }
                        break;

                    case '$':
                        // Multiline selects the end-of-line form; otherwise the end-of-input (EndZ) form is used.
                        if ((_options & RegexOptions.AnyNewLine) != 0)
                        {
                            _unit = (_options & RegexOptions.Multiline) != 0 ? AnyNewLineEolNode() : AnyNewLineEndZNode();
                        }
                        else
                        {
                            _unit = new RegexNode((_options & RegexOptions.Multiline) != 0 ? RegexNodeKind.Eol : RegexNodeKind.EndZ, _options);
                        }
                        break;

                    case '.':
                        // Singleline takes precedence: '.' matches anything. Otherwise AnyNewLine uses the
                        // "not any newline" class, and the default is "any character except '\n'".
                        if ((_options & RegexOptions.Singleline) != 0)
                        {
                            _unit = new RegexNode(RegexNodeKind.Set, _options & ~RegexOptions.IgnoreCase, RegexCharClass.AnyClass);
                        }
                        else if ((_options & RegexOptions.AnyNewLine) != 0)
                        {
                            _unit = new RegexNode(RegexNodeKind.Set, _options & ~RegexOptions.IgnoreCase, RegexCharClass.NotAnyNewLineClass);
                        }
                        else
                        {
                            _unit = new RegexNode(RegexNodeKind.Notone, _options & ~RegexOptions.IgnoreCase, '\n');
                        }
                        break;

                    case '{':
                    case '*':
                    case '+':
                    case '?':
                        if (_unit == null)
                        {
                            throw wasPrevQuantifier ?
                                MakeException(RegexParseError.NestedQuantifiersNotParenthesized, SR.Format(SR.NestedQuantifiersNotParenthesized, ch)) :
                                MakeException(RegexParseError.QuantifierAfterNothing, SR.Format(SR.QuantifierAfterNothing, ch));
                        }
                        // Step back onto the quantifier character so the quantifier handling below re-reads it.
                        --_pos;
                        break;

                    default:
                        Debug.Fail($"Unexpected char {ch}");
                        break;
                }

                ScanBlank();

                // No quantifier follows: the unit is complete, so append it as-is.
                if (_pos == _pattern.Length || !(isQuantifier = IsTrueQuantifier()))
                {
                    _concatenation!.AddChild(_unit!);
                    _unit = null;
                    goto ContinueOuterScan;
                }

                ch = _pattern[_pos++];

                // Handle quantifiers
                // (_unit is set to null at the end of the body, so this loop body runs at most once per unit.)
                while (_unit != null)
                {
                    int min = 0, max = 0;

                    switch (ch)
                    {
                        case '*':
                            max = int.MaxValue;
                            break;

                        case '?':
                            max = 1;
                            break;

                        case '+':
                            min = 1;
                            max = int.MaxValue;
                            break;

                        case '{':
                            // Forms accepted: {n}, {n,} and {n,m}. A missing upper bound means "unbounded".
                            startpos = _pos;
                            max = min = ScanDecimal();
                            if (startpos < _pos)
                            {
                                if (_pos < _pattern.Length && _pattern[_pos] == ',')
                                {
                                    _pos++;
                                    max = _pos == _pattern.Length || _pattern[_pos] == '}' ? int.MaxValue : ScanDecimal();
                                }
                            }

                            // Not a well-formed quantifier after all: append the unit unquantified and
                            // rewind to the '{' so it is rescanned as part of the pattern text.
                            if (startpos == _pos || _pos == _pattern.Length || _pattern[_pos++] != '}')
                            {
                                _concatenation!.AddChild(_unit!);
                                _unit = null;
                                _pos = startpos - 1;
                                goto ContinueOuterScan;
                            }

                            break;

                        default:
                            Debug.Fail($"Unexpected char {ch}");
                            break;
                    }

                    ScanBlank();

                    // A '?' directly after the quantifier makes it lazy.
                    bool lazy = false;
                    if (_pos < _pattern.Length && _pattern[_pos] == '?')
                    {
                        _pos++;
                        lazy = true;
                    }

                    if (min > max)
                    {
                        throw MakeException(RegexParseError.ReversedQuantifierRange, SR.ReversedQuantifierRange);
                    }

                    _concatenation!.AddChild(_unit!.MakeQuantifier(lazy, min, max));
                    _unit = null;
                }

            ContinueOuterScan:
                ;
            }

        BreakOuterScan:
            ;

            // Any group still open at the end of the pattern is missing its ')'.
            if (_stack != null)
            {
                throw MakeException(RegexParseError.InsufficientClosingParentheses, SR.InsufficientClosingParentheses);
            }

            // Close the root group; the result is read back from _unit.
            AddGroup();

            return _unit!.FinalOptimize();
        }

        /// <summary>Scans a replacement pattern into a single concatenation node.</summary>
        /// <remarks>
        /// The pattern is split at each '$': the text before it is added as literal content, and whatever
        /// follows the '$' is handed to <c>ScanDollar</c>.
        /// </remarks>
        /// <returns>The concatenation node holding the literal runs and the nodes returned by <c>ScanDollar</c>.</returns>
        private RegexNode ScanReplacement()
        {
            _concatenation = new RegexNode(RegexNodeKind.Concatenate, _options);

            while (_pos < _pattern.Length)
            {
                int literalStart = _pos;

                // Advance to the next '$', or to the end of the pattern if there is none.
                int dollarIndex = _pattern.IndexOf('$', literalStart);
                _pos = dollarIndex >= 0 ? dollarIndex : _pattern.Length;

                AddToConcatenate(literalStart, _pos - literalStart, isReplacement: true);

                if (_pos >= _pattern.Length)
                {
                    break;
                }

                // Step over the '$' and let ScanDollar interpret what follows it.
                _pos++;
                _concatenation.AddChild(ScanDollar());
                _unit = null;
            }

            return _concatenation;
        }

        /// <summary>Scans contents of [] (not including []'s), and converts to a RegexCharClass.</summary>
        /// <remarks>
        /// Character class subtractions (e.g. [a-z-[aeiou]]) are handled iteratively using an
        /// explicit parent stack to avoid stack overflow with deeply nested subtractions.
        /// </remarks>
        /// <param name="caseInsensitive">Whether case equivalences are added to each completed class.</param>
        /// <param name="scanOnly">
        /// When true, the text is only consumed (advancing <c>_pos</c>); no class is built and null is returned.
        /// </param>
        /// <returns>The character class that was built, or null when <paramref name="scanOnly"/> is true.</returns>
        /// <exception>
        /// The exception created by <c>MakeException</c> is thrown when the class is not terminated, when a
        /// range is reversed, when a shorthand class is used as a range end, or when a subtraction is not
        /// the last element of its class.
        /// </exception>
        private RegexCharClass? ScanCharClass(bool caseInsensitive, bool scanOnly)
        {
            char ch;
            char chPrev = '\0';
            bool inRange = false;
            bool firstChar = true;
            bool closed = false;
            bool startingNewLevel = false;

            // Enclosing classes that are waiting for the subtraction currently being scanned.
            // Entries are null in scan-only mode, where no classes are built.
            List<RegexCharClass?>? parents = null;

            RegexCharClass? charClass = scanOnly ? null : new RegexCharClass();

            if (_pos < _pattern.Length && _pattern[_pos] == '^')
            {
                _pos++;
                if (!scanOnly)
                {
                    charClass!.Negate = true;
                }

                // Under ECMAScript a ']' immediately after '^' is not treated as the first (literal) character.
                // The '^' may have been the last character of the pattern, so the position must be
                // range-checked before it is read; the unterminated class is then reported below.
                if ((_options & RegexOptions.ECMAScript) != 0 && _pos < _pattern.Length && _pattern[_pos] == ']')
                {
                    firstChar = false;
                }
            }

            for (; _pos < _pattern.Length; firstChar = false)
            {
                // When entering a new subtraction level, reset state for the nested character class.
                if (startingNewLevel)
                {
                    startingNewLevel = false;
                    firstChar = true;
                    if (_pos < _pattern.Length && _pattern[_pos] == '^')
                    {
                        _pos++;
                        if (!scanOnly)
                        {
                            charClass!.Negate = true;
                        }
                        if ((_options & RegexOptions.ECMAScript) != 0 && _pos < _pattern.Length && _pattern[_pos] == ']')
                        {
                            firstChar = false;
                        }
                    }
                    if (_pos >= _pattern.Length)
                    {
                        break;
                    }
                }

                // Set when ch came from an escape sequence, in which case it is never treated as syntax.
                bool translatedChar = false;
                ch = _pattern[_pos++];
                if (ch == ']')
                {
                    // A ']' in first position is a literal; it falls through to the range/literal handling below.
                    if (!firstChar)
                    {
                        // Finalize this character class level.
                        if (!scanOnly && caseInsensitive)
                        {
                            charClass!.AddCaseEquivalences(_culture);
                        }

                        // If there are parent levels, pop back to the parent and set the
                        // current class as its subtraction.
                        if (parents is { Count: > 0 })
                        {
                            RegexCharClass? parent = parents[parents.Count - 1];
                            parents.RemoveAt(parents.Count - 1);
                            if (!scanOnly)
                            {
                                parent!.AddSubtraction(charClass!);

                                // A subtraction must be immediately followed by the parent's closing ']'.
                                if (_pos < _pattern.Length && _pattern[_pos] != ']')
                                {
                                    throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
                                }
                            }
                            charClass = parent;
                            continue;
                        }

                        closed = true;
                        break;
                    }
                }
                else if (ch == '\\' && _pos < _pattern.Length)
                {
                    switch (ch = _pattern[_pos++])
                    {
                        case 'D':
                        case 'd':
                            if (!scanOnly)
                            {
                                if (inRange)
                                {
                                    throw MakeException(RegexParseError.ShorthandClassInCharacterRange, SR.Format(SR.ShorthandClassInCharacterRange, ch));
                                }
                                charClass!.AddDigit((_options & RegexOptions.ECMAScript) != 0, ch == 'D', _pattern, _pos);
                            }
                            continue;

                        case 'S':
                        case 's':
                            if (!scanOnly)
                            {
                                if (inRange)
                                {
                                    throw MakeException(RegexParseError.ShorthandClassInCharacterRange, SR.Format(SR.ShorthandClassInCharacterRange, ch));
                                }
                                charClass!.AddSpace((_options & RegexOptions.ECMAScript) != 0, ch == 'S');
                            }
                            continue;

                        case 'W':
                        case 'w':
                            if (!scanOnly)
                            {
                                if (inRange)
                                {
                                    throw MakeException(RegexParseError.ShorthandClassInCharacterRange, SR.Format(SR.ShorthandClassInCharacterRange, ch));
                                }

                                charClass!.AddWord((_options & RegexOptions.ECMAScript) != 0, ch == 'W');
                            }
                            continue;

                        case 'p':
                        case 'P':
                            if (!scanOnly)
                            {
                                if (inRange)
                                {
                                    throw MakeException(RegexParseError.ShorthandClassInCharacterRange, SR.Format(SR.ShorthandClassInCharacterRange, ch));
                                }

                                charClass!.AddCategoryFromName(ParseProperty(), ch != 'p', caseInsensitive, _pattern, _pos);
                            }
                            else
                            {
                                // Still consume the property name so the position advances past it.
                                ParseProperty();
                            }
                            continue;

                        case '-':
                            // An escaped '-' is always a literal dash, either on its own or as the end of a range.
                            if (!scanOnly)
                            {
                                if (inRange)
                                {
                                    if (chPrev > ch)
                                    {
                                        throw MakeException(RegexParseError.ReversedCharacterRange, SR.ReversedCharacterRange);
                                    }

                                    charClass!.AddRange(chPrev, ch);
                                    inRange = false;
                                    chPrev = '\0';
                                }
                                else
                                {
                                    charClass!.AddRange(ch, ch);
                                }
                            }
                            continue;

                        default:
                            --_pos;
                            ch = ScanCharEscape(); // non-literal character
                            translatedChar = true;
                            break; // this break will only break out of the switch
                    }
                }

                if (inRange)
                {
                    inRange = false;
                    if (!scanOnly)
                    {
                        if (ch == '[' && !translatedChar && !firstChar)
                        {
                            // We thought we were in a range, but we're actually starting a subtraction.
                            // In that case, we'll add chPrev to our char class, skip the opening [, and
                            // scan the new character class iteratively.
                            charClass!.AddChar(chPrev);
                            (parents ??= new List<RegexCharClass?>()).Add(charClass);
                            charClass = new RegexCharClass();
                            chPrev = '\0';
                            startingNewLevel = true;
                            continue;
                        }
                        else
                        {
                            // a regular range, like a-z
                            if (chPrev > ch)
                            {
                                throw MakeException(RegexParseError.ReversedCharacterRange, SR.ReversedCharacterRange);
                            }
                            charClass!.AddRange(chPrev, ch);
                        }
                    }
                }
                else if (_pos + 1 < _pattern.Length && _pattern[_pos] == '-' && _pattern[_pos + 1] != ']')
                {
                    // this could be the start of a range
                    chPrev = ch;
                    inRange = true;
                    _pos++;
                }
                else if (_pos < _pattern.Length && ch == '-' && !translatedChar && _pattern[_pos] == '[' && !firstChar)
                {
                    // we aren't in a range, and now there is a subtraction.  Usually this happens
                    // only when a subtraction follows a range, like [a-z-[b]]
                    _pos++;
                    (parents ??= new List<RegexCharClass?>()).Add(scanOnly ? null : charClass);
                    if (!scanOnly)
                    {
                        charClass = new RegexCharClass();
                    }
                    chPrev = '\0';
                    startingNewLevel = true;
                    continue;
                }
                else
                {
                    // A single literal character.
                    if (!scanOnly)
                    {
                        charClass!.AddRange(ch, ch);
                    }
                }
            }

            // Reaching the end of the pattern without the outermost ']' means the class was never terminated.
            if (!closed)
            {
                throw MakeException(RegexParseError.UnterminatedBracket, SR.UnterminatedBracket);
            }

            return charClass;
        }

        /// <summary>
        /// Scans chars following a '(' (not counting the '('), and returns
        /// a RegexNode for the type of group scanned, or null if the group
        /// simply changed options (?imnsx-imnsx).
        /// </summary>
        /// <remarks>
        /// Constructs recognized here: a plain "(" group, "(?:", "(?=", "(?!", "(?&gt;", "(?&lt;=", "(?&lt;!",
        /// named/numbered captures "(?&lt;name&gt;" / "(?'name'" including the balancing form "name1-name2",
        /// the conditional "(?(", and inline options "(?imnsx-imnsx)" / "(?imnsx-imnsx:".
        /// Inline comments "(?#...)" are not recognized by this method (ScanBlank is the routine that skips them);
        /// a '#' reaching the default branch ends at the InvalidGroupingConstruct error.
        /// Lookahead constructs clear RegexOptions.RightToLeft in _options and lookbehind constructs set it.
        /// Any construct that cannot be recognized results in the InvalidGroupingConstruct parse exception.
        /// </remarks>
        private RegexNode? ScanGroupOpen()
        {
            // just return a RegexNode if we have:
            // 1. "(" followed by nothing
            // 2. "(x" where x != ?
            // 3. "(?)"
            // In all three cases _pos is left untouched; nothing after the '(' is consumed here.
            if (_pos == _pattern.Length || _pattern[_pos] != '?' || (_pos + 1 < _pattern.Length && _pattern[_pos + 1] == ')'))
            {
                if ((_options & RegexOptions.ExplicitCapture) != 0 || _ignoreNextParen)
                {
                    // _ignoreNextParen is a one-shot request (set by the conditional case below): consume it.
                    _ignoreNextParen = false;
                    return new RegexNode(RegexNodeKind.Group, _options);
                }
                else
                {
                    // Unnamed capture: takes the next automatic capture number; -1 means no group is uncaptured.
                    return new RegexNode(RegexNodeKind.Capture, _options, _autocap++, -1);
                }
            }

            // Skip the '?'.
            _pos++;

            // Every path through the loop body returns, breaks, or jumps to BreakRecognize,
            // so the body executes at most once; the loop only provides a 'break' target.
            while (true)
            {
                if (_pos == _pattern.Length)
                {
                    break;
                }

                RegexNodeKind nodeType;
                // Delimiter that ends a capture name: '>' for "(?<name>", switched to '\'' for "(?'name'".
                char close = '>';
                char ch = _pattern[_pos++];
                switch (ch)
                {
                    case ':':
                        // noncapturing group
                        nodeType = RegexNodeKind.Group;
                        break;

                    case '=':
                        // lookahead assertion
                        _options &= ~RegexOptions.RightToLeft;
                        nodeType = RegexNodeKind.PositiveLookaround;
                        break;

                    case '!':
                        // negative lookahead assertion
                        _options &= ~RegexOptions.RightToLeft;
                        nodeType = RegexNodeKind.NegativeLookaround;
                        break;

                    case '>':
                        // atomic subexpression
                        nodeType = RegexNodeKind.Atomic;
                        break;

                    case '\'':
                        close = '\'';
                        goto case '<'; // fallthrough

                    case '<':
                        if (_pos == _pattern.Length)
                        {
                            goto BreakRecognize;
                        }

                        switch (ch = _pattern[_pos++])
                        {
                            case '=':
                                // "(?'=" is not a valid construct; lookbehind exists only in the '<' form.
                                if (close == '\'')
                                {
                                    goto BreakRecognize;
                                }

                                // lookbehind assertion
                                _options |= RegexOptions.RightToLeft;
                                nodeType = RegexNodeKind.PositiveLookaround;
                                break;

                            case '!':
                                // "(?'!" is not a valid construct; lookbehind exists only in the '<' form.
                                if (close == '\'')
                                {
                                    goto BreakRecognize;
                                }

                                // negative lookbehind assertion
                                _options |= RegexOptions.RightToLeft;
                                nodeType = RegexNodeKind.NegativeLookaround;
                                break;

                            default:
                                // Un-consume the first char of the name so ScanDecimal/ScanCapname see it.
                                --_pos;
                                // -1 means "none" for both the group being captured and the group being uncaptured.
                                int capnum = -1;
                                int uncapnum = -1;
                                // Set when the name starts with '-', i.e. a balancing group with no capture name.
                                bool proceed = false;

                                // grab part before -

                                if ((uint)(ch - '0') <= 9)
                                {
                                    capnum = ScanDecimal();

                                    // A number that is not a known capture slot is treated as "no capture".
                                    if (!IsCaptureSlot(capnum))
                                    {
                                        capnum = -1;
                                    }

                                    // check if we have bogus characters after the number
                                    if (_pos < _pattern.Length && !(_pattern[_pos] == close || _pattern[_pos] == '-'))
                                    {
                                        throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                    }

                                    // Group 0 cannot be redefined by the pattern.
                                    if (capnum == 0)
                                    {
                                        throw MakeException(RegexParseError.CaptureGroupOfZero, SR.CaptureGroupOfZero);
                                    }
                                }
                                else if (RegexCharClass.IsBoundaryWordChar(ch))
                                {
                                    string capname = ScanCapname();

                                    // An unknown name leaves capnum at -1.
                                    if (_capnames?[capname] is int tmpCapnum)
                                    {
                                        capnum = tmpCapnum;
                                    }

                                    // check if we have bogus character after the name
                                    if (_pos < _pattern.Length && !(_pattern[_pos] == close || _pattern[_pos] == '-'))
                                    {
                                        throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                    }
                                }
                                else if (ch == '-')
                                {
                                    proceed = true;
                                }
                                else
                                {
                                    // bad group name - starts with something other than a word character and isn't a number
                                    throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                }

                                // grab part after - if any
                                // (_pos + 1 < Length guarantees there is a character after the '-')

                                if ((capnum != -1 || proceed) && _pos + 1 < _pattern.Length && _pattern[_pos] == '-')
                                {
                                    _pos++;
                                    ch = _pattern[_pos];

                                    if ((uint)(ch - '0') <= 9)
                                    {
                                        uncapnum = ScanDecimal();

                                        // Unlike the part before '-', the uncaptured group must already exist.
                                        if (!IsCaptureSlot(uncapnum))
                                        {
                                            throw MakeException(RegexParseError.UndefinedNumberedReference, SR.Format(SR.UndefinedNumberedReference, uncapnum));
                                        }

                                        // check if we have bogus characters after the number
                                        if (_pos < _pattern.Length && _pattern[_pos] != close)
                                        {
                                            throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                        }
                                    }
                                    else if (RegexCharClass.IsBoundaryWordChar(ch))
                                    {
                                        string uncapname = ScanCapname();

                                        // Unlike the part before '-', the uncaptured group must already exist.
                                        uncapnum = _capnames?[uncapname] is int tmpCapnum ?
                                            tmpCapnum :
                                            throw MakeException(RegexParseError.UndefinedNamedReference, SR.Format(SR.UndefinedNamedReference, uncapname));

                                        // check if we have bogus character after the name
                                        if (_pos < _pattern.Length && _pattern[_pos] != close)
                                        {
                                            throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                        }
                                    }
                                    else
                                    {
                                        // bad group name - starts with something other than a word character and isn't a number
                                        throw MakeException(RegexParseError.CaptureGroupNameInvalid, SR.CaptureGroupNameInvalid);
                                    }
                                }

                                // actually make the node
                                // (requires at least one resolved group number and the closing delimiter)

                                if ((capnum != -1 || uncapnum != -1) && _pos < _pattern.Length && _pattern[_pos++] == close)
                                {
                                    return new RegexNode(RegexNodeKind.Capture, _options, capnum, uncapnum);
                                }
                                goto BreakRecognize;
                        }
                        break;

                    case '(':
                        // conditional alternation construct (?(...) | )

                        // Remember where the condition starts so we can rewind if it is not a backreference.
                        int parenPos = _pos;
                        if (_pos < _pattern.Length)
                        {
                            ch = _pattern[_pos];

                            // check if the alternation condition is a backref
                            if (ch is >= '0' and <= '9')
                            {
                                int capnum = ScanDecimal();
                                if (_pos < _pattern.Length && _pattern[_pos++] == ')')
                                {
                                    if (IsCaptureSlot(capnum))
                                    {
                                        return new RegexNode(RegexNodeKind.BackreferenceConditional, _options, capnum);
                                    }

                                    throw MakeException(RegexParseError.AlternationHasUndefinedReference, SR.Format(SR.AlternationHasUndefinedReference, capnum.ToString()));
                                }

                                // Digits not immediately followed by ')' are a malformed numeric condition.
                                throw MakeException(RegexParseError.AlternationHasMalformedReference, SR.Format(SR.AlternationHasMalformedReference, capnum.ToString()));
                            }
                            else if (RegexCharClass.IsBoundaryWordChar(ch))
                            {
                                string capname = ScanCapname();

                                // Only a known capture name directly followed by ')' counts as a backreference
                                // condition; otherwise fall through and treat the condition as an expression.
                                if (_capnames?[capname] is int tmpCapnum && _pos < _pattern.Length && _pattern[_pos++] == ')')
                                {
                                    return new RegexNode(RegexNodeKind.BackreferenceConditional, _options, tmpCapnum);
                                }
                            }
                        }
                        // not a backref
                        nodeType = RegexNodeKind.ExpressionConditional;
                        _pos = parenPos - 1;       // jump to the start of the parentheses
                        _ignoreNextParen = true;    // but make sure we don't try to capture the insides

                        // _pos now sits on the condition's '('; inspect what follows "(?" without consuming it.
                        if (_pos + 2 < _pattern.Length && _pattern[_pos + 1] == '?')
                        {
                            // disallow comments in the condition
                            if (_pattern[_pos + 2] == '#')
                            {
                                throw MakeException(RegexParseError.AlternationHasComment, SR.AlternationHasComment);
                            }

                            // disallow named capture group (?<..>..) in the condition
                            // ("(?<!" and "(?<=" are lookbehinds and remain allowed)
                            if (_pattern[_pos + 2] == '\'' || (_pos + 3 < _pattern.Length && _pattern[_pos + 2] == '<' && _pattern[_pos + 3] != '!' && _pattern[_pos + 3] != '='))
                            {
                                throw MakeException(RegexParseError.AlternationHasNamedCapture, SR.AlternationHasNamedCapture);
                            }
                        }

                        break;

                    default:
                        // Un-consume the char: it is either the first option letter or the terminator.
                        --_pos;

                        nodeType = RegexNodeKind.Group;
                        // While parsing the test of a conditional (ExpressionConditional with no children yet),
                        // disallow inline options; allow them once the test has been parsed and in the yes/no branches.
                        if (_group!.Kind != RegexNodeKind.ExpressionConditional || _group.ChildCount() > 0)
                        {
                            ScanOptions();
                        }

                        if (_pos == _pattern.Length)
                        {
                            goto BreakRecognize;
                        }

                        // "(?opts)" only changes options for what follows: there is no node to return.
                        if ((ch = _pattern[_pos++]) == ')')
                        {
                            return null;
                        }

                        // Otherwise only "(?opts:" is valid, which opens a non-capturing group with the new options.
                        if (ch != ':')
                        {
                            goto BreakRecognize;
                        }
                        break;
                }

                return new RegexNode(nodeType, _options);
            }

        BreakRecognize:
            ;
            // break Recognize comes here
            // (also reached by the 'break' above when the pattern ends right after "(?")

            throw MakeException(RegexParseError.InvalidGroupingConstruct, SR.InvalidGroupingConstruct);
        }

        /// <summary>
        /// Advances past everything ignorable at the current position: pattern whitespace and
        /// '#'-to-end-of-line comments (both only under RegexOptions.IgnorePatternWhitespace),
        /// and inline "(?#...)" comments (regardless of options).
        /// </summary>
        /// <remarks>
        /// An inline comment with no closing ')' moves the position to the end of the pattern and
        /// raises the UnterminatedComment parse exception.
        /// </remarks>
        private void ScanBlank()
        {
            bool ignoreWhitespace = (_options & RegexOptions.IgnorePatternWhitespace) != 0;

            for (; ; )
            {
                if (ignoreWhitespace)
                {
                    while (_pos < _pattern.Length && IsSpace(_pattern[_pos]))
                    {
                        _pos++;
                    }

                    if (_pos < _pattern.Length && _pattern[_pos] == '#')
                    {
                        // x-mode comment: runs up to (not including) the next newline, or to the end of the pattern.
                        int newline = _pattern.IndexOf('\n', _pos);
                        _pos = newline >= 0 ? newline : _pattern.Length;
                        continue;
                    }
                }

                bool atInlineComment =
                    _pos + 2 < _pattern.Length &&
                    _pattern[_pos] == '(' &&
                    _pattern[_pos + 1] == '?' &&
                    _pattern[_pos + 2] == '#';

                if (!atInlineComment)
                {
                    return;
                }

                int closeParen = _pattern.IndexOf(')', _pos);
                if (closeParen < 0)
                {
                    _pos = _pattern.Length;
                    throw MakeException(RegexParseError.UnterminatedComment, SR.UnterminatedComment);
                }

                // Resume just past the ')' and look for more ignorable text.
                _pos = closeParen + 1;
            }
        }

        /// <summary>
        /// Scans the escape that follows a '\' (the '\' itself is not included) and returns the node it denotes.
        /// </summary>
        /// <param name="scanOnly">
        /// When true, the escape is consumed but no node is built and null is returned.
        /// </param>
        /// <returns>The node for the escape, or null when <paramref name="scanOnly"/> is true.</returns>
        private RegexNode? ScanBackslash(bool scanOnly)
        {
            Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern");

            char ch = _pattern[_pos];

            // Anchors and boundaries whose node kind is derived directly from the letter.
            if (ch is 'b' or 'B' or 'A' or 'G' or 'z')
            {
                _pos++;
                return scanOnly ? null : new RegexNode(TypeFromCode(ch), _options);
            }

            // \Z gets a dedicated node when the AnyNewLine option is in effect.
            if (ch == 'Z')
            {
                _pos++;
                if (scanOnly)
                {
                    return null;
                }

                return (_options & RegexOptions.AnyNewLine) != 0 ?
                    AnyNewLineEndZNode() :
                    new RegexNode(RegexNodeKind.EndZ, _options);
            }

            // Predefined classes: \w \W \s \S \d \D, each with an ECMAScript-specific variant.
            bool ecma = (_options & RegexOptions.ECMAScript) != 0;
            string? predefinedSet = ch switch
            {
                'w' => ecma ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass,
                'W' => ecma ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass,
                's' => ecma ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass,
                'S' => ecma ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass,
                'd' => ecma ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass,
                'D' => ecma ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass,
                _ => null,
            };

            if (predefinedSet is not null)
            {
                _pos++;
                // The set node is created with IgnoreCase removed from its options.
                return scanOnly ? null : new RegexNode(RegexNodeKind.Set, _options & ~RegexOptions.IgnoreCase, predefinedSet);
            }

            // Unicode category / block: \p{X} and its negation \P{X}.
            if (ch is 'p' or 'P')
            {
                _pos++;
                if (scanOnly)
                {
                    return null;
                }

                var categoryClass = new RegexCharClass();
                string categoryName = ParseProperty();
                bool ignoreCase = (_options & RegexOptions.IgnoreCase) != 0;

                categoryClass.AddCategoryFromName(categoryName, ch == 'P', ignoreCase, _pattern, _pos);
                if (ignoreCase)
                {
                    categoryClass.AddCaseEquivalences(_culture);
                }

                return new RegexNode(RegexNodeKind.Set, _options & ~RegexOptions.IgnoreCase, categoryClass.ToStringClass());
            }

            // Everything else: backreferences and single-character escapes.
            RegexNode? node = ScanBasicBackslash(scanOnly);
            if (node is { Kind: RegexNodeKind.Backreference } && (node.Options & RegexOptions.IgnoreCase) != 0)
            {
                // Record that the pattern contains at least one case-insensitive backreference.
                _hasIgnoreCaseBackreferenceNodes = true;
            }

            return node;
        }

        /// <summary>Scans \-style backreferences and character escapes</summary>
        /// <remarks>
        /// Tries, in order: a delimited numeric reference (\k&lt;1&gt;, \k'1', \&lt;1&gt;, \'1'), a bare numeric reference (\1),
        /// and a delimited named reference (\k&lt;name&gt;, \k'name', \&lt;name&gt;, \'name'). If none of these
        /// produces a result, the position is rewound to where scanning started and the text is
        /// re-read as a single-character escape via ScanCharEscape.
        /// </remarks>
        /// <param name="scanOnly">When true, input is consumed but no node is created and null is returned.</param>
        /// <returns>A Backreference node or a single-character node; null when <paramref name="scanOnly"/> is true.</returns>
        private RegexNode? ScanBasicBackslash(bool scanOnly)
        {
            Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern");

            // Saved so the scan can be restarted as a plain character escape.
            int backpos = _pos;
            // Closing delimiter expected after a delimited reference: '>' or '\''.
            char close = '\0';
            // True once an opening '<' or '\'' delimiter has been seen.
            bool angled = false;
            char ch = _pattern[_pos];

            // allow \k<foo> instead of \<foo>, which is now deprecated

            if (ch == 'k')
            {
                if (_pos + 1 < _pattern.Length)
                {
                    _pos++;
                    ch = _pattern[_pos++];
                    if (ch is '<' or '\'')
                    {
                        angled = true;
                        close = (ch == '\'') ? '\'' : '>';
                    }
                }

                // \k must be followed by '<' or '\'' and at least one more character.
                if (!angled || _pos == _pattern.Length)
                {
                    throw MakeException(RegexParseError.MalformedNamedReference, SR.MalformedNamedReference);
                }

                ch = _pattern[_pos];
            }

            // Note angle without \k

            else if ((ch == '<' || ch == '\'') && _pos + 1 < _pattern.Length)
            {
                angled = true;
                close = (ch == '\'') ? '\'' : '>';
                _pos++;
                ch = _pattern[_pos];
            }

            // Try to parse backreference: \<1>

            if (angled && ch >= '0' && ch <= '9')
            {
                int capnum = ScanDecimal();

                // Without the matching closing delimiter this is not a reference; fall through to the rewind below.
                if (_pos < _pattern.Length && _pattern[_pos++] == close)
                {
                    return
                        scanOnly ? null :
                        IsCaptureSlot(capnum) ? new RegexNode(RegexNodeKind.Backreference, _options, capnum) :
                        throw MakeException(RegexParseError.UndefinedNumberedReference, SR.Format(SR.UndefinedNumberedReference, capnum.ToString()));
                }
            }

            // Try to parse backreference or octal: \1

            else if (!angled && ch >= '1' && ch <= '9')
            {
                if ((_options & RegexOptions.ECMAScript) != 0)
                {
                    // ECMAScript: extend the number digit by digit while it does not exceed _captop,
                    // remembering the last value that qualified as a reference.
                    int capnum = -1;
                    int newcapnum = ch - '0';
                    // NOTE(review): presumably the index of the '\' that introduced this escape - confirm against callers.
                    int pos = _pos - 1;
                    while (newcapnum <= _captop)
                    {
                        // NOTE(review): _caps appears to map a capture number to a pattern position, so this
                        // would accept only groups that start before this reference - TODO confirm.
                        if (IsCaptureSlot(newcapnum) && (_caps == null || (int)_caps[newcapnum]! < pos))
                        {
                            capnum = newcapnum;
                        }

                        _pos++;
                        if (_pos == _pattern.Length || (ch = _pattern[_pos]) < '0' || ch > '9')
                        {
                            break;
                        }

                        newcapnum = newcapnum * 10 + (ch - '0');
                    }

                    // No qualifying group: fall through and re-scan as a character escape.
                    if (capnum >= 0)
                    {
                        return scanOnly ? null : new RegexNode(RegexNodeKind.Backreference, _options, capnum);
                    }
                }
                else
                {
                    int capnum = ScanDecimal();

                    // In scan-only mode the number is consumed without being validated.
                    if (scanOnly)
                    {
                        return null;
                    }

                    if (IsCaptureSlot(capnum))
                    {
                        return new RegexNode(RegexNodeKind.Backreference, _options, capnum);
                    }

                    // A single-digit number that is not a capture slot is an error; larger
                    // numbers fall through and are re-scanned as a character escape.
                    if (capnum <= 9)
                    {
                        throw MakeException(RegexParseError.UndefinedNumberedReference, SR.Format(SR.UndefinedNumberedReference, capnum.ToString()));
                    }
                }
            }

            // Try to parse backreference: \<foo>

            else if (angled && RegexCharClass.IsBoundaryWordChar(ch))
            {
                string capname = ScanCapname();

                // Without the matching closing delimiter this is not a reference; fall through to the rewind below.
                if (_pos < _pattern.Length && _pattern[_pos++] == close)
                {
                    return
                        scanOnly ? null :
                        _capnames?[capname] is int tmpCapnum ? new RegexNode(RegexNodeKind.Backreference, _options, tmpCapnum) :
                        throw MakeException(RegexParseError.UndefinedNamedReference, SR.Format(SR.UndefinedNamedReference, capname));
                }
            }

            // Not backreference: must be char code

            // Discard everything consumed above and re-read from the first character after the '\'.
            _pos = backpos;
            ch = ScanCharEscape();

            return !scanOnly ?
                RegexNode.CreateOneWithCaseConversion(ch, _options, _culture, ref _caseBehavior) :
                null;
        }

        /// <summary>
        /// Scans the text that follows a '$' in a replacement pattern and returns the node it denotes.
        /// </summary>
        /// <remarks>
        /// Recognized forms are $n, ${n}, ${name}, $$, $&amp;, $`, $', $+ and $_. For anything else
        /// (including a '$' at the very end of the pattern, or a reference to a group that does not exist)
        /// nothing is consumed and a literal '$' node is returned.
        /// </remarks>
        private RegexNode ScanDollar()
        {
            if (_pos == _pattern.Length)
            {
                return RegexNode.CreateOneWithCaseConversion('$', _options, _culture, ref _caseBehavior);
            }

            int start = _pos;
            char ch = _pattern[_pos];

            // '{' opens the braced form only when at least one more character follows it.
            bool braced = ch == '{' && _pos + 1 < _pattern.Length;
            if (braced)
            {
                ch = _pattern[++_pos];
            }

            if (ch is >= '0' and <= '9')
            {
                // Numbered reference: $1 or ${1}
                if (braced || (_options & RegexOptions.ECMAScript) == 0)
                {
                    int number = ScanDecimal();
                    bool terminated = !braced || (_pos < _pattern.Length && _pattern[_pos++] == '}');
                    if (terminated && IsCaptureSlot(number))
                    {
                        return new RegexNode(RegexNodeKind.Backreference, _options, number);
                    }
                }
                else
                {
                    // ECMAScript, unbraced: read every digit, but keep the longest prefix
                    // whose value is an existing capture slot and stop the consumption there.
                    int candidate = ch - '0';
                    int bestGroup = -1;
                    int bestEnd = start;

                    _pos++;
                    if (IsCaptureSlot(candidate))
                    {
                        bestGroup = candidate;
                        bestEnd = _pos;
                    }

                    while (_pos < _pattern.Length)
                    {
                        int digit = _pattern[_pos] - '0';
                        if ((uint)digit > 9)
                        {
                            break;
                        }

                        // Refuse to accumulate a value that would not fit in an int.
                        if (candidate > MaxValueDiv10 || (candidate == MaxValueDiv10 && digit > MaxValueMod10))
                        {
                            throw MakeException(RegexParseError.QuantifierOrCaptureGroupOutOfRange, SR.QuantifierOrCaptureGroupOutOfRange);
                        }

                        candidate = candidate * 10 + digit;
                        _pos++;

                        if (IsCaptureSlot(candidate))
                        {
                            bestGroup = candidate;
                            bestEnd = _pos;
                        }
                    }

                    _pos = bestEnd;
                    if (bestGroup >= 0)
                    {
                        return new RegexNode(RegexNodeKind.Backreference, _options, bestGroup);
                    }
                }
            }
            else if (braced)
            {
                // Named reference: ${name}
                if (RegexCharClass.IsBoundaryWordChar(ch))
                {
                    string name = ScanCapname();
                    if (_pos < _pattern.Length && _pattern[_pos++] == '}' && _capnames?[name] is int namedGroup)
                    {
                        return new RegexNode(RegexNodeKind.Backreference, _options, namedGroup);
                    }
                }
            }
            else
            {
                // "$$" is an escaped dollar sign.
                if (ch == '$')
                {
                    _pos++;
                    return RegexNode.CreateOneWithCaseConversion('$', _options, _culture, ref _caseBehavior);
                }

                // Single-character substitutions; 1 marks "not one of them".
                const int Unrecognized = 1;
                int special = ch switch
                {
                    '&' => 0,
                    '`' => RegexReplacement.LeftPortion,
                    '\'' => RegexReplacement.RightPortion,
                    '+' => RegexReplacement.LastGroup,
                    '_' => RegexReplacement.WholeString,
                    _ => Unrecognized,
                };

                if (special != Unrecognized)
                {
                    _pos++;
                    return new RegexNode(RegexNodeKind.Backreference, _options, special);
                }
            }

            // Unrecognized '$': give back whatever was consumed and emit the '$' as a literal.
            _pos = start;
            return RegexNode.CreateOneWithCaseConversion('$', _options, _culture, ref _caseBehavior);
        }

        /// <summary>
        /// Scans a capture name: consumes the run of word characters (per RegexCharClass.IsBoundaryWordChar)
        /// starting at the current position and returns it. The result is empty if there is no such run.
        /// </summary>
        private string ScanCapname()
        {
            int nameStart = _pos;

            while (_pos < _pattern.Length && RegexCharClass.IsBoundaryWordChar(_pattern[_pos]))
            {
                _pos++;
            }

            int nameLength = _pos - nameStart;
            return _pattern.Substring(nameStart, nameLength);
        }

        /// <summary>
        /// Scans up to three octal digits and returns the character whose code is the low 8 bits of their value.
        /// </summary>
        /// <remarks>
        /// Under RegexOptions.ECMAScript, scanning stops as soon as the accumulated value reaches 0x20 or more.
        /// </remarks>
        private char ScanOctal()
        {
            // At most three digits, and never past the end of the pattern.
            int limit = Math.Min(_pos + 3, _pattern.Length);
            bool ecma = (_options & RegexOptions.ECMAScript) != 0;
            int value = 0;

            while (_pos < limit)
            {
                int digit = _pattern[_pos] - '0';
                if ((uint)digit > 7)
                {
                    break;
                }

                _pos++;
                value = (value * 8) + digit;

                if (ecma && value >= 0x20)
                {
                    break;
                }
            }

            // Octal codes only go up to 255.  Any larger and the behavior that Perl follows
            // is simply to truncate the high bits.
            return (char)(value & 0xFF);
        }

        /// <summary>
        /// Scans any number of decimal digits and returns their value (0 when there are no digits).
        /// </summary>
        /// <remarks>
        /// A value that would exceed int.MaxValue is not clamped: the
        /// QuantifierOrCaptureGroupOutOfRange parse exception is thrown instead.
        /// </remarks>
        private int ScanDecimal()
        {
            int value = 0;

            while (_pos < _pattern.Length)
            {
                int digit = _pattern[_pos] - '0';
                if ((uint)digit > 9)
                {
                    break;
                }

                // The digit is consumed before the range check, so the error is raised just past it.
                _pos++;

                bool wouldOverflow = value > MaxValueDiv10 || (value == MaxValueDiv10 && digit > MaxValueMod10);
                if (wouldOverflow)
                {
                    throw MakeException(RegexParseError.QuantifierOrCaptureGroupOutOfRange, SR.QuantifierOrCaptureGroupOutOfRange);
                }

                value = (value * 10) + digit;
            }

            return value;
        }

        /// <summary>Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) and returns the character they encode.</summary>
        /// <param name="c">The number of hex digits that must be present.</param>
        /// <remarks>
        /// Throws the InsufficientOrInvalidHexDigits parse exception when fewer than <paramref name="c"/>
        /// characters remain (nothing is consumed in that case) or when a non-hex character is encountered.
        /// </remarks>
        private char ScanHex(int c)
        {
            int remaining = c;
            int value = 0;

            // Only start consuming when the pattern is long enough to hold all requested digits.
            if (_pattern.Length - _pos >= remaining)
            {
                while (remaining > 0)
                {
                    int nibble = HexConverter.FromChar(_pattern[_pos++]);
                    if (nibble == 0xFF)
                    {
                        // 0xFF is the value tested for to detect a character that is not a hex digit.
                        break;
                    }

                    value = (value * 0x10) + nibble;
                    remaining--;
                }
            }

            if (remaining > 0)
            {
                throw MakeException(RegexParseError.InsufficientOrInvalidHexDigits, SR.InsufficientOrInvalidHexDigits);
            }

            return (char)value;
        }

        /// <summary>
        /// Consumes the character following \c and converts it to the ASCII control character it names.
        /// </summary>
        /// <remarks>
        /// Lowercase letters are treated like their uppercase counterparts (\ca is \cA). Throws the
        /// MissingControlCharacter parse exception at the end of the pattern, and UnrecognizedControlCharacter
        /// when the character does not map to a code below 0x20.
        /// </remarks>
        private char ScanControl()
        {
            if (_pos == _pattern.Length)
            {
                throw MakeException(RegexParseError.MissingControlCharacter, SR.MissingControlCharacter);
            }

            char ch = _pattern[_pos++];

            if (ch is >= 'a' and <= 'z')
            {
                ch = (char)(ch - ('a' - 'A'));
            }

            // '@' maps to 0, 'A' to 1, and so on; characters below '@' wrap around to large values and are rejected.
            char control = (char)(ch - '@');
            if (control >= ' ')
            {
                throw MakeException(RegexParseError.UnrecognizedControlCharacter, SR.UnrecognizedControlCharacter);
            }

            return control;
        }

        /// <summary>
        /// Scans an inline option string such as "imnsx-imnsx" and applies it to the current options,
        /// stopping (without consuming) at the first character that is not '+', '-' or an option letter.
        /// </summary>
        /// <remarks>
        /// Letters are matched case-insensitively. Options following a '-' are turned off; options at the
        /// start or following a '+' are turned on.
        /// </remarks>
        private void ScanOptions()
        {
            bool turnOff = false;

            while (_pos < _pattern.Length)
            {
                char ch = _pattern[_pos];

                switch (ch)
                {
                    case '-':
                        turnOff = true;
                        break;

                    case '+':
                        turnOff = false;
                        break;

                    default:
                        // OR-ing with 0x20 folds an uppercase ASCII letter onto its lowercase form.
                        RegexOptions flag = (char)(ch | 0x20) switch
                        {
                            'i' => RegexOptions.IgnoreCase,
                            'm' => RegexOptions.Multiline,
                            'n' => RegexOptions.ExplicitCapture,
                            's' => RegexOptions.Singleline,
                            'x' => RegexOptions.IgnorePatternWhitespace,
                            _ => RegexOptions.None,
                        };

                        if (flag == RegexOptions.None)
                        {
                            // Not an option letter: leave it for the caller to interpret.
                            return;
                        }

                        _options = turnOff ? _options & ~flag : _options | flag;
                        break;
                }

                _pos++;
            }
        }

        /// <summary>
        /// Scans the code following a backslash for escapes that denote a single character and
        /// returns that character.
        /// </summary>
        /// <remarks>
        /// Octal digits, \x (two hex digits), \u (four hex digits) and \c (control character) are
        /// delegated to the corresponding Scan* helpers. Any other character is returned as-is,
        /// unless it is a word character and ECMAScript mode is off, in which case it is rejected.
        /// </remarks>
        private char ScanCharEscape()
        {
            char code = _pattern[_pos++];

            switch (code)
            {
                case >= '0' and <= '7':
                    // Step back so that ScanOctal sees the first digit as well.
                    _pos--;
                    return ScanOctal();

                case 'x':
                    return ScanHex(2);

                case 'u':
                    return ScanHex(4);

                case 'c':
                    return ScanControl();

                case 'a':
                    return '\u0007';

                case 'b':
                    return '\b';

                case 'e':
                    return '\e';

                case 'f':
                    return '\f';

                case 'n':
                    return '\n';

                case 'r':
                    return '\r';

                case 't':
                    return '\t';

                case 'v':
                    return '\u000B';
            }

            // Everything else escapes to itself, except unknown word characters outside ECMAScript mode.
            bool ecmaScript = (_options & RegexOptions.ECMAScript) != 0;
            if (!ecmaScript && RegexCharClass.IsBoundaryWordChar(code))
            {
                throw MakeException(RegexParseError.UnrecognizedEscape, SR.Format(SR.UnrecognizedEscape, code));
            }

            return code;
        }

        /// <summary>
        /// Scans the braces-delimited name X of a \p{X} or \P{X} construct, starting at the '{',
        /// and returns X.
        /// </summary>
        /// <exception cref="RegexParseException">
        /// Too few characters remain, the opening '{' is missing, or the name is not followed by '}'.
        /// </exception>
        private string ParseProperty()
        {
            if (_pos + 2 >= _pattern.Length)
            {
                throw MakeException(RegexParseError.InvalidUnicodePropertyEscape, SR.InvalidUnicodePropertyEscape);
            }

            if (_pattern[_pos++] != '{')
            {
                throw MakeException(RegexParseError.MalformedUnicodePropertyEscape, SR.MalformedUnicodePropertyEscape);
            }

            // The name is the longest run of word characters and hyphens.
            int nameStart = _pos;
            while (_pos < _pattern.Length)
            {
                char candidate = _pattern[_pos];
                if (!RegexCharClass.IsBoundaryWordChar(candidate) && candidate != '-')
                {
                    break;
                }

                _pos++;
            }

            string name = _pattern.Substring(nameStart, _pos - nameStart);

            if (_pos == _pattern.Length || _pattern[_pos++] != '}')
            {
                throw MakeException(RegexParseError.InvalidUnicodePropertyEscape, SR.InvalidUnicodePropertyEscape);
            }

            return name;
        }

        /// <summary>
        /// Builds the node tree for (?!(?&lt;=\r)\n): a negative lookahead that fails when the
        /// current position sits between the \r and the \n of a \r\n pair. The AnyNewLine anchor
        /// builders append it so that their anchors do not match in the middle of \r\n.
        /// </summary>
        private RegexNode AnyNewLineCrLfGuardNode()
        {
            // The lookahead parts run left-to-right, the lookbehind parts right-to-left.
            // The literal characters additionally drop IgnoreCase.
            RegexOptions forward = _options & ~RegexOptions.RightToLeft;
            RegexOptions backward = _options | RegexOptions.RightToLeft;

            // (?<=\r)
            var precededByCr = new RegexNode(RegexNodeKind.PositiveLookaround, backward);
            precededByCr.AddChild(new RegexNode(RegexNodeKind.One, backward & ~RegexOptions.IgnoreCase, '\r'));

            // (?<=\r)\n
            var lfAfterCr = new RegexNode(RegexNodeKind.Concatenate, forward);
            lfAfterCr.AddChild(precededByCr);
            lfAfterCr.AddChild(new RegexNode(RegexNodeKind.One, forward & ~RegexOptions.IgnoreCase, '\n'));

            // (?!(?<=\r)\n)
            var notInsideCrLf = new RegexNode(RegexNodeKind.NegativeLookaround, forward);
            notInsideCrLf.AddChild(lfAfterCr);

            return notInsideCrLf;
        }

        /// <summary>
        /// Builds the node tree standing in for $ or \Z under AnyNewLine without Multiline:
        /// it matches at the end of the string, or just before a final newline sequence,
        /// but never between \r and \n.
        /// Equivalent to: (?=\r\n\z|[\n\r\v\f\u0085\u2028\u2029]?\z)(?!(?&lt;=\r)\n)
        /// </summary>
        private RegexNode AnyNewLineEndZNode()
        {
            RegexOptions forward = _options & ~RegexOptions.RightToLeft;
            RegexOptions forwardCaseSensitive = forward & ~RegexOptions.IgnoreCase;

            // First alternative: \r\n\z
            var crLfThenEnd = new RegexNode(RegexNodeKind.Concatenate, forward);
            crLfThenEnd.AddChild(new RegexNode(RegexNodeKind.One, forwardCaseSensitive, '\r'));
            crLfThenEnd.AddChild(new RegexNode(RegexNodeKind.One, forwardCaseSensitive, '\n'));
            crLfThenEnd.AddChild(new RegexNode(RegexNodeKind.End, forward));

            // Second alternative: [\n\r\v\f\u0085\u2028\u2029]?\z (an optional single newline, then end of string)
            RegexNode optionalNewLine = new RegexNode(RegexNodeKind.Set, forwardCaseSensitive, RegexCharClass.AnyNewLineClass).MakeQuantifier(false, 0, 1);
            var newLineThenEnd = new RegexNode(RegexNodeKind.Concatenate, forward);
            newLineThenEnd.AddChild(optionalNewLine);
            newLineThenEnd.AddChild(new RegexNode(RegexNodeKind.End, forward));

            // (?=\r\n\z|[\n\r\v\f\u0085\u2028\u2029]?\z)
            var alternatives = new RegexNode(RegexNodeKind.Alternate, forward);
            alternatives.AddChild(crLfThenEnd);
            alternatives.AddChild(newLineThenEnd);
            var atEnd = new RegexNode(RegexNodeKind.PositiveLookaround, forward);
            atEnd.AddChild(alternatives);

            // (?=\r\n\z|[\n\r\v\f\u0085\u2028\u2029]?\z)(?!(?<=\r)\n)
            var anchor = new RegexNode(RegexNodeKind.Concatenate, _options);
            anchor.AddChild(atEnd);
            anchor.AddChild(AnyNewLineCrLfGuardNode());

            return anchor;
        }

        /// <summary>
        /// Builds the node tree standing in for $ under AnyNewLine combined with Multiline:
        /// it matches before any newline character or at the end of the string,
        /// but never between \r and \n.
        /// Equivalent to: (?=[\n\r\v\f\u0085\u2028\u2029]|\z)(?!(?&lt;=\r)\n)
        /// </summary>
        private RegexNode AnyNewLineEolNode()
        {
            RegexOptions forward = _options & ~RegexOptions.RightToLeft;

            // [\n\r\v\f\u0085\u2028\u2029]|\z (the set is matched case-sensitively)
            var newLineOrEnd = new RegexNode(RegexNodeKind.Alternate, forward);
            newLineOrEnd.AddChild(new RegexNode(RegexNodeKind.Set, forward & ~RegexOptions.IgnoreCase, RegexCharClass.AnyNewLineClass));
            newLineOrEnd.AddChild(new RegexNode(RegexNodeKind.End, forward));

            // (?=[\n\r\v\f\u0085\u2028\u2029]|\z)
            var beforeNewLine = new RegexNode(RegexNodeKind.PositiveLookaround, forward);
            beforeNewLine.AddChild(newLineOrEnd);

            // (?=[\n\r\v\f\u0085\u2028\u2029]|\z)(?!(?<=\r)\n)
            var anchor = new RegexNode(RegexNodeKind.Concatenate, _options);
            anchor.AddChild(beforeNewLine);
            anchor.AddChild(AnyNewLineCrLfGuardNode());

            return anchor;
        }

        /// <summary>
        /// Builds the node tree standing in for ^ under AnyNewLine combined with Multiline:
        /// it matches after any newline character or at the start of the string,
        /// but never between \r and \n.
        /// Equivalent to: (?&lt;=[\n\r\v\f\u0085\u2028\u2029]|\A)(?!(?&lt;=\r)\n)
        /// </summary>
        private RegexNode AnyNewLineBolNode()
        {
            // Lookbehind contents are built with RightToLeft set.
            RegexOptions backward = _options | RegexOptions.RightToLeft;

            // [\n\r\v\f\u0085\u2028\u2029]|\A (the set is matched case-sensitively)
            var newLineOrStart = new RegexNode(RegexNodeKind.Alternate, backward);
            newLineOrStart.AddChild(new RegexNode(RegexNodeKind.Set, backward & ~RegexOptions.IgnoreCase, RegexCharClass.AnyNewLineClass));
            newLineOrStart.AddChild(new RegexNode(RegexNodeKind.Beginning, backward));

            // (?<=[\n\r\v\f\u0085\u2028\u2029]|\A)
            var afterNewLine = new RegexNode(RegexNodeKind.PositiveLookaround, backward);
            afterNewLine.AddChild(newLineOrStart);

            // (?<=[\n\r\v\f\u0085\u2028\u2029]|\A)(?!(?<=\r)\n)
            var anchor = new RegexNode(RegexNodeKind.Concatenate, _options);
            anchor.AddChild(afterNewLine);
            anchor.AddChild(AnyNewLineCrLfGuardNode());

            return anchor;
        }

        /// <summary>
        /// Maps the letter of a zero-width assertion escape (\b, \B, \A, \G, \Z, \z) to its node kind.
        /// The word-boundary kinds depend on whether ECMAScript mode is on. Any other letter
        /// yields <see cref="RegexNodeKind.Nothing"/>.
        /// </summary>
        private readonly RegexNodeKind TypeFromCode(char ch)
        {
            bool ecmaScript = (_options & RegexOptions.ECMAScript) != 0;

            switch (ch)
            {
                case 'b':
                    return ecmaScript ? RegexNodeKind.ECMABoundary : RegexNodeKind.Boundary;

                case 'B':
                    return ecmaScript ? RegexNodeKind.NonECMABoundary : RegexNodeKind.NonBoundary;

                case 'A':
                    return RegexNodeKind.Beginning;

                case 'G':
                    return RegexNodeKind.Start;

                case 'Z':
                    return RegexNodeKind.EndZ;

                case 'z':
                    return RegexNodeKind.End;

                default:
                    return RegexNodeKind.Nothing;
            }
        }

        /// <summary>
        /// A prescanner for deducing the slots used for captures by doing a partial tokenization of the pattern.
        /// </summary>
        /// <remarks>
        /// Walks the pattern from the current position to its end, noting every numbered and named
        /// capture group it sees together with the pattern position of its opening parenthesis,
        /// then calls <see cref="AssignNameSlots"/> to give the named groups their slot numbers.
        /// Inline option constructs are tracked while scanning because ExplicitCapture and
        /// IgnorePatternWhitespace change how the rest of the pattern is tokenized.
        /// </remarks>
        /// <param name="optionsFoundInPattern">
        /// Receives the union of the effective options observed after each inline option construct was scanned.
        /// </param>
        private void CountCaptures(out RegexOptions optionsFoundInPattern)
        {
            // Slot 0 is always noted up front; automatically numbered groups start at 1.
            NoteCaptureSlot(0, 0);
            optionsFoundInPattern = RegexOptions.None;
            _autocap = 1;

            while (_pos < _pattern.Length)
            {
                // Remember where this character sits; for '(' it is recorded as the group's position.
                int pos = _pos;
                char ch = _pattern[_pos++];
                switch (ch)
                {
                    case '\\':
                        // Scan over the escape so an escaped paren/bracket isn't treated as syntax.
                        if (_pos < _pattern.Length)
                        {
                            ScanBackslash(scanOnly: true);
                        }
                        break;

                    case '#':
                        // '#' is only special when pattern whitespace is being ignored; back up so
                        // ScanBlank starts on the '#' itself.
                        if ((_options & RegexOptions.IgnorePatternWhitespace) != 0)
                        {
                            --_pos;
                            ScanBlank();
                        }
                        break;

                    case '[':
                        // Scan over the character class so its contents aren't tokenized here.
                        ScanCharClass(caseInsensitive: false, scanOnly: true);
                        break;

                    case ')':
                        // Leaving a group: restore the options that were saved at its '('.
                        if (_optionsStack.Length != 0)
                        {
                            _options = (RegexOptions)_optionsStack.Pop();
                        }
                        break;

                    case '(':
                        if (_pos + 1 < _pattern.Length && _pattern[_pos + 1] == '#' && _pattern[_pos] == '?')
                        {
                            // we have a comment (?#
                            --_pos;
                            ScanBlank();
                        }
                        else
                        {
                            // Save the current options; the matching ')' pops them again.
                            _optionsStack.Append((int)_options);
                            if (_pos < _pattern.Length && _pattern[_pos] == '?')
                            {
                                // we have (?...
                                _pos++;

                                if (_pos + 1 < _pattern.Length && (_pattern[_pos] == '<' || _pattern[_pos] == '\''))
                                {
                                    // named group: (?<... or (?'...

                                    _pos++;
                                    ch = _pattern[_pos];

                                    // Only a name starting with a word character other than '0' is noted here;
                                    // a leading digit 1-9 means an explicitly numbered group.
                                    if (ch != '0' && RegexCharClass.IsBoundaryWordChar(ch))
                                    {
                                        if ((uint)(ch - '1') <= '9' - '1')
                                        {
                                            NoteCaptureSlot(ScanDecimal(), pos);
                                        }
                                        else
                                        {
                                            NoteCaptureName(ScanCapname(), pos);
                                        }
                                    }
                                }
                                else
                                {
                                    // (?...

                                    // get the options if it's an option construct (?cimsx-cimsx...)
                                    ScanOptions();
                                    optionsFoundInPattern |= _options;

                                    if (_pos < _pattern.Length)
                                    {
                                        if (_pattern[_pos] == ')')
                                        {
                                            // (?cimsx-cimsx)
                                            // The ')' is consumed here, so it never reaches the ')' case above;
                                            // drop the options saved for this '(' so the new options stay in effect.
                                            _pos++;
                                            _optionsStack.Length--;
                                        }
                                        else if (_pattern[_pos] == '(')
                                        {
                                            // alternation construct: (?(foo)yes|no)
                                            // ignore the next paren so we don't capture the condition
                                            _ignoreNextParen = true;

                                            // break from here so we don't reset _ignoreNextParen
                                            break;
                                        }
                                    }
                                }
                            }
                            else
                            {
                                // Simple (unnamed) capture group.
                                // Add unnamed parentheses if ExplicitCapture is not set
                                // and the next parentheses is not ignored.
                                if ((_options & RegexOptions.ExplicitCapture) == 0 && !_ignoreNextParen)
                                {
                                    NoteCaptureSlot(_autocap++, pos);
                                }
                            }
                        }

                        // The "ignore" flag only ever applies to the paren immediately following (?( .
                        _ignoreNextParen = false;
                        break;
                }
            }

            AssignNameSlots();
        }

        /// <summary>
        /// Records capture slot <paramref name="i"/> as used, together with the pattern position
        /// <paramref name="pos"/> at which it was found. Does nothing if the slot is already recorded.
        /// </summary>
        private void NoteCaptureSlot(int i, int pos)
        {
            // Box the key once and reuse it for both the lookup and the insertion.
            object key = i;
            if (_caps.ContainsKey(key))
            {
                return;
            }

            // the rhs of the hashtable isn't used in the parser
            _caps.Add(key, pos);
            _capcount++;

            // Keep _captop one past the highest slot seen, without overflowing at int.MaxValue.
            if (i >= _captop)
            {
                _captop = i < int.MaxValue ? i + 1 : i;
            }
        }

        /// <summary>
        /// Records capture name <paramref name="name"/> together with the pattern position
        /// <paramref name="pos"/> at which it was first found. Repeated names are ignored.
        /// </summary>
        private void NoteCaptureName(string name, int pos)
        {
            // The name table and the ordered name list are created together on first use.
            if (_capnames is null)
            {
                _capnames = new Hashtable();
                _capnamelist = new List<string>();
            }

            if (_capnames.ContainsKey(name))
            {
                return;
            }

            _capnames.Add(name, pos);
            _capnamelist!.Add(name);
        }

        /// <summary>Assigns unused slot numbers to the capture names.</summary>
        /// <remarks>
        /// Works in three steps: (1) each noted name, in the order it was noted, receives the next
        /// slot number not already in use; (2) if the used slot numbers have gaps, a sorted list of
        /// them is built in <c>_capnumlist</c>; (3) if there are names or gaps, <c>_capnamelist</c>
        /// is rebuilt so that it has one entry per used slot in ascending slot order, using the
        /// group's name where it has one and the slot number as text otherwise.
        /// </remarks>
        private void AssignNameSlots()
        {
            if (_capnames != null)
            {
                for (int i = 0; i < _capnamelist!.Count; i++)
                {
                    // Skip past slot numbers that are already taken.
                    while (IsCaptureSlot(_autocap))
                    {
                        _autocap++;
                    }

                    // Until now the table mapped name -> pattern position; replace the value with
                    // the assigned slot number and note the slot at that same position.
                    string name = _capnamelist[i];
                    int pos = (int)_capnames[name]!;
                    _capnames[name] = _autocap;
                    NoteCaptureSlot(_autocap, pos);

                    _autocap++;
                }
            }

            // if the caps array has at least one gap, construct the list of used slots

            if (_capcount < _captop)
            {
                _capnumlist = new int[_capcount];
                int i = 0;

                // Manual use of IDictionaryEnumerator instead of foreach to avoid DictionaryEntry box allocations.
                IDictionaryEnumerator de = _caps.GetEnumerator();
                while (de.MoveNext())
                {
                    _capnumlist[i++] = (int)de.Key;
                }

                // Hashtable enumeration order is unspecified, so sort to get ascending slot numbers.
                Array.Sort(_capnumlist);
            }

            // merge capsnumlist into capnamelist

            if (_capnames != null || _capnumlist != null)
            {
                List<string>? oldcapnamelist;
                int next; // slot number of the next not-yet-merged name, or -1 when none remain
                int k = 0; // index into oldcapnamelist of that name

                if (_capnames == null)
                {
                    // No named groups at all: every entry will be a slot number rendered as text.
                    oldcapnamelist = null;
                    _capnames = new Hashtable();
                    _capnamelist = new List<string>();
                    next = -1;
                }
                else
                {
                    oldcapnamelist = _capnamelist;
                    _capnamelist = new List<string>();
                    next = (int)_capnames[oldcapnamelist![0]]!;
                }

                for (int i = 0; i < _capcount; i++)
                {
                    // j is the i-th used slot number (slots are dense when there is no _capnumlist).
                    int j = (_capnumlist == null) ? i : _capnumlist[i];

                    if (next == j)
                    {
                        // This slot belongs to the next name; emit the name and advance to the following one.
                        _capnamelist.Add(oldcapnamelist![k++]);
                        next = (k == oldcapnamelist.Count) ? -1 : (int)_capnames[oldcapnamelist[k]]!;
                    }
                    else
                    {
                        // Unnamed slot: its name is its number, which is also registered in the name table.
                        string str = j.ToString(_culture);
                        _capnamelist.Add(str);
                        _capnames[str] = j;
                    }
                }
            }
        }

        /// <summary>
        /// Determines whether capture slot <paramref name="i"/> is in use: membership in the capture
        /// table when there is one, otherwise whether <paramref name="i"/> lies in [0, _capsize).
        /// </summary>
        private readonly bool IsCaptureSlot(int i) =>
            _caps is null
                ? i >= 0 && i < _capsize
                : _caps.ContainsKey(i);

        /// <summary>
        /// Translates a capture number into its index in the dense array of capture slots.
        /// A regex that uses a sparse set of capture numbers carries a table mapping each number
        /// to its dense index; the lookup is performed here, ahead of time, rather than at match time.
        /// </summary>
        /// <param name="capnum">The capture number, or -1 for "no capture".</param>
        /// <param name="caps">The number-to-index table, or null when capture numbers are already dense.</param>
        /// <returns>-1 for -1; the mapped index when a table is supplied; otherwise <paramref name="capnum"/> itself.</returns>
        internal static int MapCaptureNumber(int capnum, Hashtable? caps)
        {
            // Both "no capture" (-1) and "no table" pass the number through unchanged.
            if (capnum == -1 || caps is null)
            {
                return capnum;
            }

            return (int)caps[capnum]!;
        }

        // Category codes stored in the Category table below. Their numeric order matters:
        // the Is* helpers compare with >=, so a higher code also satisfies tests for lower ones.
        private const byte Q = 4;    // quantifier          * + ? {
        private const byte S = 3;    // stopper             $ ( ) . [ \ ^ |
        private const byte Z = 2;    // # stopper           #
        private const byte W = 1;    // whitespace          \t \n \f \r ' '

        /// <summary>For categorizing ASCII characters.</summary>
        /// <remarks>
        /// Indexed by character code (0x00-0x7F). An entry of 0 means the character is ordinary.
        /// Vertical tab (0x0B) is deliberately ordinary: the whitespace category is exactly
        /// \t \n \f \r and space, the same set that appears in <see cref="s_metachars"/>.
        /// </remarks>
        private static ReadOnlySpan<byte> Category =>
        [
            // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
               0, 0, 0, 0, 0, 0, 0, 0, 0, W, W, 0, W, W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            //    !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
               W, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
            // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
            // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~ DEL
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0
        ];

        /// <summary>
        /// Search set containing the whitespace characters (\t \n \f \r space), '#', and the
        /// stopper and quantifier characters $ ( ) * + . ? [ \ ^ { | (the same characters that
        /// are listed on the W, Z, S and Q category constants).
        /// </summary>
        // NOTE(review): not referenced in this part of the file; presumably used elsewhere to
        // locate characters that need escaping. Confirm at the usage site.
        private static readonly SearchValues<char> s_metachars =
            SearchValues.Create("\t\n\f\r #$()*+.?[\\^{|");

        /// <summary>
        /// Determines whether <paramref name="ch"/> is a stopper or quantifier character,
        /// i.e. one that ends a run of ordinary characters.
        /// </summary>
        private static bool IsSpecial(char ch)
        {
            // '|' is the highest character that can have a non-ordinary category.
            if (ch > '|')
            {
                return false;
            }

            return Category[ch] >= S;
        }

        /// <summary>
        /// Determines whether <paramref name="ch"/> has any non-ordinary category (whitespace, '#',
        /// stopper or quantifier), i.e. it ends a run of ordinary characters when whitespace counts too.
        /// </summary>
        private static bool IsSpecialOrSpace(char ch)
        {
            // '|' is the highest character that can have a non-ordinary category.
            if (ch > '|')
            {
                return false;
            }

            return Category[ch] >= W;
        }

        /// <summary>Determines whether <paramref name="ch"/> is a character that begins a quantifier.</summary>
        private static bool IsQuantifier(char ch)
        {
            // '{' is the highest quantifier character, so nothing above it needs a table lookup.
            if (ch > '{')
            {
                return false;
            }

            return Category[ch] == Q;
        }

        /// <summary>Determines whether <paramref name="ch"/> is in the whitespace category.</summary>
        private static bool IsSpace(char ch)
        {
            // Space is the highest whitespace character, so nothing above it needs a table lookup.
            if (ch > ' ')
            {
                return false;
            }

            return Category[ch] == W;
        }

        /// <summary>
        /// Determines whether the character at the current position really starts a quantifier.
        /// For anything other than '{' that is a plain category check. A '{' only counts when it
        /// begins a well-formed {n}, {n,} or {n,m} with decimal digits. The position is not advanced.
        /// </summary>
        private readonly bool IsTrueQuantifier()
        {
            Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern");

            int open = _pos;
            char first = _pattern[open];
            if (first != '{')
            {
                return first <= '{' && Category[first] >= Q;
            }

            int end = _pattern.Length;

            // Skip the digits of the first number.
            int cursor = open + 1;
            while (cursor < end && _pattern[cursor] is >= '0' and <= '9')
            {
                cursor++;
            }

            // Either the pattern ran out, or there was no digit directly after the '{'.
            if (cursor == end || cursor == open + 1)
            {
                return false;
            }

            switch (_pattern[cursor])
            {
                case '}':
                    return true;

                case ',':
                    break;

                default:
                    return false;
            }

            // Skip the (possibly empty) digits of the second number. A '}' must follow.
            cursor++;
            while (cursor < end && _pattern[cursor] is >= '0' and <= '9')
            {
                cursor++;
            }

            return cursor < end && _pattern[cursor] == '}';
        }

        /// <summary>
        /// Appends <paramref name="cch"/> characters of the pattern, starting at <paramref name="pos"/>,
        /// to the current concatenation as literal text.
        /// </summary>
        /// <remarks>
        /// A single character becomes one node (case conversion is suppressed for replacement patterns).
        /// Longer text becomes a single Multi node when case doesn't matter for it, and otherwise one
        /// case-converted node per character.
        /// </remarks>
        private void AddToConcatenate(int pos, int cch, bool isReplacement)
        {
            if (cch == 0)
            {
                return;
            }

            if (cch == 1)
            {
                RegexOptions singleCharOptions = isReplacement ? _options & ~RegexOptions.IgnoreCase : _options;
                _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], singleCharOptions, _culture, ref _caseBehavior));
                return;
            }

            bool caseSensitive = (_options & RegexOptions.IgnoreCase) == 0;
            if (cch > 1 && (caseSensitive || isReplacement || !RegexCharClass.ParticipatesInCaseConversion(_pattern.AsSpan(pos, cch))))
            {
                // Nothing in the text is affected by case, so it can be kept as one string.
                _concatenation!.AddChild(new RegexNode(RegexNodeKind.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
                return;
            }

            foreach (char c in _pattern.AsSpan(pos, cch))
            {
                _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(c, _options, _culture, ref _caseBehavior));
            }
        }

        /// <summary>
        /// Saves the parser state in response to an open paren: the current group, alternation and
        /// concatenation are chained together through their Parent links, and the concatenation
        /// becomes the new top of the stack.
        /// </summary>
        private void PushGroup()
        {
            RegexNode group = _group!;
            RegexNode alternation = _alternation!;
            RegexNode concatenation = _concatenation!;

            // stack <- group <- alternation <- concatenation
            group.Parent = _stack;
            alternation.Parent = group;
            concatenation.Parent = alternation;

            _stack = concatenation;
        }

        /// <summary>
        /// Restores the parser state saved by <see cref="PushGroup"/> in response to a ')',
        /// by walking the Parent links back from the top of the stack.
        /// </summary>
        /// <exception cref="RegexParseException">
        /// The restored group is an expression conditional that still needs its condition, but no unit is available.
        /// </exception>
        private void PopGroup()
        {
            _concatenation = _stack;
            _alternation = _concatenation!.Parent;
            _group = _alternation!.Parent;
            _stack = _group!.Parent;

            RegexNode group = _group;

            // Only an expression conditional that has no children yet needs anything further.
            bool needsCondition = group.Kind == RegexNodeKind.ExpressionConditional && group.ChildCount() == 0;
            if (!needsCondition)
            {
                return;
            }

            // The first () inside a Testgroup group goes directly to the group
            if (_unit is not RegexNode condition)
            {
                throw MakeException(RegexParseError.AlternationHasMalformedCondition, SR.AlternationHasMalformedCondition);
            }

            group.AddChild(condition);
            _unit = null;
        }

        /// <summary>
        /// Begins a fresh parsing round (in response to an open paren or the start of the pattern):
        /// <paramref name="openGroup"/> becomes the current group, with a new empty alternation and
        /// concatenation created under the current options.
        /// </summary>
        private void StartGroup(RegexNode openGroup)
        {
            RegexOptions options = _options;

            _group = openGroup;
            _alternation = new RegexNode(RegexNodeKind.Alternate, options);
            _concatenation = new RegexNode(RegexNodeKind.Concatenate, options);
        }

        /// <summary>
        /// Closes the current concatenation in response to a '|' and starts a new, empty one.
        /// </summary>
        private void AddAlternate()
        {
            // The | parts inside a Testgroup group go directly to the group.
            // Everywhere else they become branches of the current alternation.
            RegexNode branchOwner = _group!.Kind is RegexNodeKind.ExpressionConditional or RegexNodeKind.BackreferenceConditional
                ? _group
                : _alternation!;

            branchOwner.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());

            _concatenation = new RegexNode(RegexNodeKind.Concatenate, _options);
        }

        /// <summary>
        /// Closes the current group in response to a ')' or the end of the pattern, and makes the
        /// finished group the current unit.
        /// </summary>
        /// <exception cref="RegexParseException">A conditional group ended up with too many branches.</exception>
        private void AddGroup()
        {
            RegexNode group = _group!;

            if (group.Kind is not (RegexNodeKind.ExpressionConditional or RegexNodeKind.BackreferenceConditional))
            {
                // Ordinary group: the last concatenation joins the alternation, which becomes the group's child.
                RegexNode alternation = _alternation!;
                alternation.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
                group.AddChild(alternation);
            }
            else
            {
                // Conditional group: branches are direct children of the group.
                group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());

                // A backreference conditional may hold at most two children, an expression conditional three.
                int maxChildren = group.Kind == RegexNodeKind.BackreferenceConditional ? 2 : 3;
                if (group.ChildCount() > maxChildren)
                {
                    throw MakeException(RegexParseError.AlternationHasTooManyConditions, SR.AlternationHasTooManyConditions);
                }
            }

            _unit = group;
        }

        /// <summary>
        /// Creates a <see cref="RegexParseException"/> for <paramref name="error"/> at the current
        /// position, with a message that embeds the pattern, the position and <paramref name="message"/>.
        /// </summary>
        private readonly RegexParseException MakeException(RegexParseError error, string message)
        {
            int offset = _pos;
            string text = SR.Format(SR.MakeException, _pattern, offset, message);

            return new RegexParseException(error, offset, text);
        }

        /// <summary>Gets the name of the group with number <paramref name="i"/>.</summary>
        /// <param name="caps">Optional table mapping group numbers to indices into <paramref name="capslist"/>.</param>
        /// <param name="capslist">Optional list of group names. When null, groups are named by their number.</param>
        /// <param name="capsize">The number of groups, used when there is no name list.</param>
        /// <param name="i">The group number.</param>
        /// <returns>The group's name, or an empty string when the number doesn't identify a group.</returns>
        internal static string GroupNameFromNumber(Hashtable? caps, string[]? capslist, int capsize, int i)
        {
            if (capslist is null)
            {
                // No names: a valid group's name is simply its number.
                return (uint)i < (uint)capsize ? ((uint)i).ToString() : string.Empty;
            }

            // With a table, the number must first be translated to an index into the name list.
            if (caps is not null && !caps.TryGetValue(i, out i))
            {
                return string.Empty;
            }

            return (uint)i < (uint)capslist.Length ? capslist[i] : string.Empty;
        }
    }
}
// File: System\Text\RegularExpressions\RegexParser.cs
// Project: src\runtime\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)