File: System\Text\RegularExpressions\RegexCharClass.cs
Web Access
Project: src\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Threading;
 
namespace System.Text.RegularExpressions
{
    // The main function of RegexCharClass is as a builder to turn ranges, characters and
    // Unicode categories into a single string.  This string is used as a black box
    // representation of a character class by the rest of Regex.  The format is as follows.
    //
    // Char index   Use
    //      0       Flags - currently this only holds the "negate" flag
    //      1       length of the string representing the "set" portion, e.g. [a-z0-9] only has a "set"
    //      2       length of the string representing the "category" portion, e.g. [\p{Lu}] only has a "category"
    //      3...m   The set.  These are a series of ranges which define the characters included in the set.
    //              To determine if a given character is in the set, we binary search over this set of ranges
    //              and see where the character should go.  Based on whether the ending index is odd or even,
    //              we know if the character is in the set.
    //      m+1...n The categories.  This is a list of UnicodeCategory enum values which describe categories
    //              included in this class. These can either be individual values (either UnicodeCategory - 1
    //              for inclusive values, or -1 - UnicodeCategory for exclusive values), or a "group", which
    //              is a contiguous sequence of such values surrounded by \0 values; all values in the group
    //              have the same positive/negative orientation.
 
    /// <summary>Provides the "set of Unicode chars" functionality used by the regexp engine.</summary>
    internal sealed partial class RegexCharClass
    {
        // Constants
        internal const int FlagsIndex = 0;
        internal const int SetLengthIndex = 1;
        internal const int CategoryLengthIndex = 2;
        internal const int SetStartIndex = 3; // must be odd for subsequent logic to work
 
        internal const char LastChar = '\uFFFF';
 
        internal const short SpaceConst = 100;
        private const short NotSpaceConst = -100;
 
        private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
        private const string SpaceCategories = "\x64";
        private const string NotSpaceCategories = "\uFF9C";
        private const string WordCategories = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000";
        private const string NotWordCategories = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000";
 
        internal const string SpaceClass = "\u0000\u0000\u0001\u0064"; // \s
        internal const string NotSpaceClass = "\u0000\u0000\u0001\uFF9C"; // \S
        internal const string NegatedSpaceClass = "\u0001\0\u0001d"; // [^\s]
        internal const string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; // \w
        internal const string NotWordClass = "\u0000\u0000\u000A\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; // \W
        internal const string NegatedWordClass = "\u0001\0\n\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0"; // [^\w]
        internal const string DigitClass = "\u0000\u0000\u0001\u0009"; // \d
        internal const string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; // \D
        internal const string NegatedDigitClass = "\u0001\0\u0001\t"; // [^\d]
        internal const string ControlClass = "\0\0\u0001\u000f"; // \p{Cc}
        internal const string NotControlClass = "\0\0\u0001\ufff1"; // \P{Cc}
        internal const string LetterClass = "\0\0\a\0\u0002\u0004\u0005\u0003\u0001\0"; // \p{L}
        internal const string NotLetterClass = "\0\0\u0007\0\ufffe\ufffc\ufffb\ufffd\uffff\0"; // \P{L}
        internal const string LetterOrDigitClass = "\0\0\b\0\u0002\u0004\u0005\u0003\u0001\0\t"; // [\p{L}\d]
        internal const string NotLetterOrDigitClass = "\u0001\0\b\0\u0002\u0004\u0005\u0003\u0001\0\t"; // [^\p{L}\d]
        internal const string LowerClass = "\0\0\u0001\u0002"; // \p{Ll}
        internal const string NotLowerClass = "\0\0\u0001\ufffe"; // \P{Ll}
        internal const string UpperClass = "\0\0\u0001\u0001"; // \p{Lu}
        internal const string NotUpperClass = "\0\0\u0001\uffff"; // \P{Lu}
        internal const string NumberClass = "\0\0\u0005\0\t\n\v\0"; // \p{N}
        internal const string NotNumberClass = "\0\0\u0005\0\ufff7\ufff6\ufff5\0"; // \P{N}
        internal const string PunctuationClass = "\0\0\t\0\u0013\u0014\u0016\u0019\u0015\u0018\u0017\0"; // \p{P}
        internal const string NotPunctuationClass = "\0\0\u0009\0\uffed\uffec\uffea\uffe7\uffeb\uffe8\uffe9\0"; // \P{P}
        internal const string SeparatorClass = "\0\0\u0005\0\r\u000e\f\0"; // \p{Z}
        internal const string NotSeparatorClass = "\0\0\u0005\0\ufff3\ufff2\ufff4\0"; // \P{Z}
        internal const string SymbolClass = "\0\0\u0006\0\u001b\u001c\u001a\u001d\0"; // \p{S}
        internal const string NotSymbolClass = "\0\0\u0006\0\uffe5\uffe4\uffe6\uffe3\0"; // \P{S}
        internal const string AsciiLetterClass = "\0\u0004\0A[a{"; // [A-Za-z]
        internal const string NotAsciiLetterClass = "\u0001\u0004\0A[a{"; // [^A-Za-z]
        internal const string AsciiLetterOrDigitClass = "\0\u0006\00:A[a{"; // [A-Za-z0-9]
        internal const string NotAsciiLetterOrDigitClass = "\u0001\u0006\00:A[a{"; // [^A-Za-z0-9]
        internal const string HexDigitClass = "\0\u0006\00:AGag"; // [A-Fa-f0-9]
        internal const string NotHexDigitClass = "\u0001\u0006\00:AGag"; // [^A-Fa-f0-9]
        internal const string HexDigitUpperClass = "\0\u0004\00:AG"; // [A-F0-9]
        internal const string NotHexDigitUpperClass = "\u0001\u0004\00:AG"; // [A-F0-9]
        internal const string HexDigitLowerClass = "\0\u0004\00:ag"; // [a-f0-9]
        internal const string NotHexDigitLowerClass = "\u0001\u0004\00:ag"; // [a-f0-9]
 
        private const string ECMASpaceRanges = "\u0009\u000E\u0020\u0021";
        private const string NotECMASpaceRanges = "\0\u0009\u000E\u0020\u0021";
        private const string ECMAWordRanges = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
        private const string NotECMAWordRanges = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
        private const string ECMADigitRanges = "\u0030\u003A";
        private const string NotECMADigitRanges = "\0\u0030\u003A";
 
        internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceRanges;
        internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceRanges;
        internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordRanges;
        internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordRanges;
        internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges;
        internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitRanges;
 
        internal const string NotNewLineClass = "\x01\x02\x00\x0A\x0B";
        internal const string NotAnyNewLineClass = "\x01\x06\x00\x0A\x0E\x85\x86\u2028\u202A";
        /// <summary>Character class for [\n\v\f\r\u0085\u2028\u2029] — all AnyNewLine chars, used in anchor lowering.</summary>
        internal const string AnyNewLineClass = "\x00\x06\x00\x0A\x0E\x85\x86\u2028\u202A";
 
        internal const string AnyClass = "\x00\x01\x00\x00";
        private const string EmptyClass = "\x00\x00\x00";
 
        // Sets regularly used as a canonical way to express the equivalent of '.' with Singleline when Singleline isn't in use.
        internal const string WordNotWordClass = "\u0000\u0000\u0014\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000";
        internal const string NotWordWordClass = "\u0000\u0000\u0014\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000";
        internal const string DigitNotDigitClass = "\u0000\u0000\u0002\u0009\uFFF7";
        internal const string NotDigitDigitClass = "\u0000\u0000\u0002\uFFF7\u0009";
        internal const string SpaceNotSpaceClass = "\u0000\u0000\u0002\u0064\uFF9C";
        internal const string NotSpaceSpaceClass = "\u0000\u0000\u0002\uFF9C\u0064";
 
        // UnicodeCategory is zero based, so we add one to each value and subtract it off later
        private const int DefinedCategoriesCapacity = 38;
        private static readonly Dictionary<string, string> s_definedCategories = new Dictionary<string, string>(DefinedCategoriesCapacity)
        {
            // Others
            { "Cc", "\u000F" }, // UnicodeCategory.Control + 1
            { "Cf", "\u0010" }, // UnicodeCategory.Format + 1
            { "Cn", "\u001E" }, // UnicodeCategory.OtherNotAssigned + 1
            { "Co", "\u0012" }, // UnicodeCategory.PrivateUse + 1
            { "Cs", "\u0011" }, // UnicodeCategory.Surrogate + 1
            { "C", "\u0000\u000F\u0010\u001E\u0012\u0011\u0000" },
 
            // Letters
            { "Ll", "\u0002" }, // UnicodeCategory.LowercaseLetter + 1
            { "Lm", "\u0004" }, // UnicodeCategory.ModifierLetter + 1
            { "Lo", "\u0005" }, // UnicodeCategory.OtherLetter + 1
            { "Lt", "\u0003" }, // UnicodeCategory.TitlecaseLetter + 1
            { "Lu", "\u0001" }, // UnicodeCategory.UppercaseLetter + 1
            { "L", "\u0000\u0002\u0004\u0005\u0003\u0001\u0000" },
 
            // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
            // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
            { InternalRegexIgnoreCase, "\u0000\u0002\u0003\u0001\u0000" },
 
            // Marks
            { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1
            { "Me", "\u0008" }, // UnicodeCategory.EnclosingMark + 1
            { "Mn", "\u0006" }, // UnicodeCategory.NonSpacingMark + 1
            { "M", "\u0000\u0007\u0008\u0006\u0000" },
 
            // Numbers
            { "Nd", "\u0009" }, // UnicodeCategory.DecimalDigitNumber + 1
            { "Nl", "\u000A" }, // UnicodeCategory.LetterNumber + 1
            { "No", "\u000B" }, // UnicodeCategory.OtherNumber + 1
            { "N", "\u0000\u0009\u000A\u000B\u0000" },
 
            // Punctuation
            { "Pc", "\u0013" }, // UnicodeCategory.ConnectorPunctuation + 1
            { "Pd", "\u0014" }, // UnicodeCategory.DashPunctuation + 1
            { "Pe", "\u0016" }, // UnicodeCategory.ClosePunctuation + 1
            { "Po", "\u0019" }, // UnicodeCategory.OtherPunctuation + 1
            { "Ps", "\u0015" }, // UnicodeCategory.OpenPunctuation + 1
            { "Pf", "\u0018" }, // UnicodeCategory.FinalQuotePunctuation + 1
            { "Pi", "\u0017" }, // UnicodeCategory.InitialQuotePunctuation + 1
            { "P", "\u0000\u0013\u0014\u0016\u0019\u0015\u0018\u0017\u0000" },
 
            // Symbols
            { "Sc", "\u001B" }, // UnicodeCategory.CurrencySymbol + 1
            { "Sk", "\u001C" }, // UnicodeCategory.ModifierSymbol + 1
            { "Sm", "\u001A" }, // UnicodeCategory.MathSymbol + 1
            { "So", "\u001D" }, // UnicodeCategory.OtherSymbol + 1
            { "S", "\u0000\u001B\u001C\u001A\u001D\u0000" },
 
            // Separators
            { "Zl", "\u000D" }, // UnicodeCategory.LineSeparator + 1
            { "Zp", "\u000E" }, // UnicodeCategory.ParagraphSeparator + 1
            { "Zs", "\u000C" }, // UnicodeCategory.SpaceSeparator + 1
            { "Z", "\u0000\u000D\u000E\u000C\u0000" },
        };
 
        private static readonly char[] s_whitespaceChars =
            ['\u0009', '\u000A', '\u000B', '\u000C', '\u000D',
             '\u0020', '\u0085', '\u00A0', '\u1680', '\u2000',
             '\u2001', '\u2002', '\u2003', '\u2004', '\u2005',
             '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
             '\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
 
        private List<(char First, char Last)>? _rangelist;
        private StringBuilder? _categories;
        private RegexCharClass? _subtractor;
        private bool _negate;
        private RegexCaseBehavior _caseBehavior;
 
#if DEBUG
        static RegexCharClass()
        {
            // Make sure the initial capacity for s_definedCategories is correct
            Debug.Assert(
                s_definedCategories.Count == DefinedCategoriesCapacity,
                $"Expected (s_definedCategories.Count): {s_definedCategories.Count}, Actual (DefinedCategoriesCapacity): {DefinedCategoriesCapacity}");
 
            // Make sure the s_propTable is correctly ordered
            int len = s_propTable.Length;
            for (int i = 0; i < len - 1; i++)
                Debug.Assert(string.Compare(s_propTable[i][0], s_propTable[i + 1][0], StringComparison.Ordinal) < 0, $"RegexCharClass s_propTable is out of order at ({s_propTable[i][0]}, {s_propTable[i + 1][0]})");
 
            // Make sure character information is in sync with Unicode data.
            var whitespaceSet = new HashSet<char>(s_whitespaceChars);
            for (int i = 0; i <= char.MaxValue; i++)
            {
                Debug.Assert(whitespaceSet.Contains((char)i) == char.IsWhiteSpace((char)i));
            }
        }
#endif
 
        /// <summary>
        /// Creates an empty character class.
        /// </summary>
        public RegexCharClass()
        {
        }
 
        private RegexCharClass(bool negate, List<(char First, char Last)>? ranges, StringBuilder? categories, RegexCharClass? subtraction)
        {
            _rangelist = ranges;
            _categories = categories;
            _negate = negate;
            _subtractor = subtraction;
        }
 
        public bool CanMerge => !_negate && _subtractor == null;
 
        public bool Negate
        {
            set { _negate = value; }
        }
 
        public void AddChar(char c) => AddRange(c, c);
 
        /// <summary>
        /// Adds a regex char class
        /// </summary>
        public void AddCharClass(RegexCharClass cc)
        {
            Debug.Assert(cc.CanMerge && CanMerge, "Both character classes added together must be able to merge");
 
            int ccRangeCount = cc._rangelist?.Count ?? 0;
 
            if (ccRangeCount != 0)
            {
                EnsureRangeList().AddRange(cc._rangelist!);
            }
 
            if (cc._categories != null)
            {
                EnsureCategories().Append(cc._categories);
            }
        }
 
        /// <summary>Adds a regex char class if the classes are mergeable.</summary>
        public bool TryAddCharClass(RegexCharClass cc)
        {
            if (cc.CanMerge && CanMerge)
            {
                AddCharClass(cc);
                return true;
            }
 
            return false;
        }
 
        private StringBuilder EnsureCategories() =>
            _categories ??= new StringBuilder();
 
        private List<(char First, char Last)> EnsureRangeList() =>
            _rangelist ??= new List<(char First, char Last)>(6);
 
        /// <summary>
        /// Adds ranges (specified by their range string representation) to the class.
        /// </summary>
        private void AddRanges(ReadOnlySpan<char> set)
        {
            Debug.Assert(!set.IsEmpty);
 
            List<(char First, char Last)> rangeList = EnsureRangeList();
 
            int i;
            for (i = 0; i < set.Length - 1; i += 2)
            {
                rangeList.Add((set[i], (char)(set[i + 1] - 1)));
            }
 
            if (i < set.Length)
            {
                rangeList.Add((set[i], LastChar));
            }
        }
 
        public void AddSubtraction(RegexCharClass sub)
        {
            Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
            _subtractor = sub;
        }
 
        /// <summary>
        /// Adds a single range of characters to the class.
        /// </summary>
        public void AddRange(char first, char last) =>
            EnsureRangeList().Add((first, last));
 
        public void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern, int currentPos)
        {
            if (s_definedCategories.TryGetValue(categoryName, out string? category) &&
                !categoryName.Equals(InternalRegexIgnoreCase))
            {
                if (caseInsensitive && (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt")))
                {
                    // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
                    category = s_definedCategories[InternalRegexIgnoreCase];
                }
 
                StringBuilder categories = EnsureCategories();
                if (invert)
                {
                    // Negate category
                    for (int i = 0; i < category.Length; i++)
                    {
                        short ch = (short)category[i];
                        categories.Append((char)-ch);
                    }
                }
                else
                {
                    categories.Append(category);
                }
            }
            else
            {
                AddRanges(RangesFromProperty(categoryName, invert, pattern, currentPos));
            }
        }
 
        private void AddCategory(string category) => EnsureCategories().Append(category);
 
        /// <summary>
        /// Adds to the class any case-equivalence versions of characters already
        /// in the class. Used for case-insensitivity.
        /// </summary>
        public void AddCaseEquivalences(CultureInfo culture)
        {
            List<(char First, char Last)>? rangeList = _rangelist;
            if (rangeList != null)
            {
                int count = rangeList.Count;
                for (int i = 0; i < count; i++)
                {
                    (char First, char Last) range = rangeList[i];
                    if (range.First == range.Last)
                    {
                        if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(range.First, culture, ref _caseBehavior, out ReadOnlySpan<char> equivalences))
                        {
                            foreach (char equivalence in equivalences)
                            {
                                AddChar(equivalence);
                            }
                        }
                    }
                    else
                    {
                        AddCaseEquivalenceRange(range.First, range.Last, culture);
                    }
                }
            }
        }
 
        /// <summary>
        /// For a single range that's in the set, adds any additional ranges
        /// necessary to ensure that lowercase equivalents are also included.
        /// </summary>
        private void AddCaseEquivalenceRange(char chMin, char chMax, CultureInfo culture)
        {
            for (int i = chMin; i <= chMax; i++)
            {
                if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior((char)i, culture, ref _caseBehavior, out ReadOnlySpan<char> equivalences))
                {
                    foreach (char equivalence in equivalences)
                    {
                        AddChar(equivalence);
                    }
                }
            }
        }
 
        public void AddWord(bool ecma, bool negate)
        {
            if (ecma)
            {
                AddRanges((negate ? NotECMAWordRanges : ECMAWordRanges).AsSpan());
            }
            else
            {
                AddCategory(negate ? NotWordCategories : WordCategories);
            }
        }
 
        public void AddSpace(bool ecma, bool negate)
        {
            if (ecma)
            {
                AddRanges((negate ? NotECMASpaceRanges : ECMASpaceRanges).AsSpan());
            }
            else
            {
                AddCategory(negate ? NotSpaceCategories : SpaceCategories);
            }
        }
 
        public void AddDigit(bool ecma, bool negate, string pattern, int currentPos)
        {
            if (ecma)
            {
                AddRanges((negate ? NotECMADigitRanges : ECMADigitRanges).AsSpan());
            }
            else
            {
                AddCategoryFromName("Nd", negate, caseInsensitive: false, pattern, currentPos);
            }
        }
 
        public static string ConvertOldStringsToClass(string set, string category)
        {
            bool startsWithNulls = set.Length >= 2 && set[0] == '\0' && set[1] == '\0';
            int strLength = SetStartIndex + set.Length + category.Length;
            if (startsWithNulls)
            {
                strLength -= 2;
            }
 
            return
#if NET
                string
#else
                StringExtensions
#endif
                .Create(strLength, (set, category, startsWithNulls), static (span, state) =>
            {
                int index;
 
                if (state.startsWithNulls)
                {
                    span[FlagsIndex] = (char)0x1;
                    span[SetLengthIndex] = (char)(state.set.Length - 2);
                    span[CategoryLengthIndex] = (char)state.category.Length;
                    state.set.AsSpan(2).CopyTo(span.Slice(SetStartIndex));
                    index = SetStartIndex + state.set.Length - 2;
                }
                else
                {
                    span[FlagsIndex] = '\0';
                    span[SetLengthIndex] = (char)state.set.Length;
                    span[CategoryLengthIndex] = (char)state.category.Length;
                    state.set.AsSpan().CopyTo(span.Slice(SetStartIndex));
                    index = SetStartIndex + state.set.Length;
                }
 
                state.category.AsSpan().CopyTo(span.Slice(index));
            });
        }
 
        /// <summary>
        /// Returns the char
        /// </summary>
        public static char SingletonChar(string set)
        {
            Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
            return set[SetStartIndex];
        }
 
        public static bool IsMergeable(string charClass) =>
            charClass != null &&
            !IsNegated(charClass) &&
            !IsSubtraction(charClass);
 
        public static bool IsEmpty(string charClass) =>
            charClass[CategoryLengthIndex] == 0 &&
            charClass[SetLengthIndex] == 0 &&
            !IsNegated(charClass) &&
            !IsSubtraction(charClass);
 
        /// <summary><c>true</c> if the set contains a single character only</summary>
        /// <remarks>
        /// This will happen not only from character classes manually written to contain a single character,
        /// but much more frequently by the implementation/parser itself, e.g. when looking for \n as part of
        /// finding the end of a line, when processing an alternation like "hello|hithere" where the first
        /// character of both options is the same, etc.
        /// </remarks>
        public static bool IsSingleton(string set) =>
            set[CategoryLengthIndex] == 0 &&
            set[SetLengthIndex] == 2 &&
            !IsNegated(set) &&
            !IsSubtraction(set) &&
            (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
 
        public static bool IsSingletonInverse(string set) =>
            set[CategoryLengthIndex] == 0 &&
            set[SetLengthIndex] == 2 &&
            IsNegated(set) &&
            !IsSubtraction(set) &&
            (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
 
        /// <summary>
        /// Gets the categories from a set if the set is only categories (no ranges, no subtraction),
        /// they all share the same negation status (not doing so is rare), and they all fit in the destination span.
        /// </summary>
        /// <param name="set">The character class to examine.</param>
        /// <param name="categories">The destination span into which the categories should be written.</param>
        /// <param name="numCategories">The number of categories written to <paramref name="categories"/>.</param>
        /// <param name="negated">false if the categories written to <paramref name="categories"/> represent inclusions; true if they represent exclusions.</param>
        /// <returns>true if the categories could be retrieved; otherwise, false.</returns>
        public static bool TryGetOnlyCategories(string set, Span<UnicodeCategory> categories, out int numCategories, out bool negated)
        {
            negated = false;
            numCategories = 0;
            bool sawFirstCategory = false;
 
            // Require that the character class has no ranges, has no subtraction, and has categories.
            int categoryLength = set[CategoryLengthIndex];
            if (categoryLength == 0 || set[SetLengthIndex] != 0 || IsSubtraction(set))
            {
                return false;
            }
 
            // Loop through all categories, storing them into the categories span.
            int categoryEnd = SetStartIndex + set[CategoryLengthIndex];
            for (int pos = SetStartIndex; pos < categoryEnd; pos++)
            {
                // Get the next category value.
                short c = (short)set[pos];
                if (c > 0)
                {
                    // It's a positive (inclusive) value.  Make sure all previous categories seen are also positive.
                    // Also make sure it's not the fake space category, which consumers don't handle as it's
                    // not a real UnicodeCategory.
                    if ((sawFirstCategory && negated) ||
                        c == SpaceConst ||
                        numCategories == categories.Length)
                    {
                        return false;
                    }
 
                    sawFirstCategory = true;
                    categories[numCategories++] = (UnicodeCategory)(c - 1);
                }
                else if (c < 0)
                {
                    // It's a negative (exclusive) value.  Make sure all previous categories seen are also negative.
                    // Also make sure it's not the fake non-space category, which consumers don't handle as it's
                    // not a real UnicodeCategory.
                    if ((sawFirstCategory && !negated) ||
                        c == NotSpaceConst ||
                        numCategories == categories.Length)
                    {
                        return false;
                    }
 
                    sawFirstCategory = true;
                    negated = true;
                    categories[numCategories++] = (UnicodeCategory)(-1 - c);
                }
                else // c == 0
                {
                    // It's the start of a group. Every value in the group needs to have the same orientation.
                    // We stop when we hit the next 0.
                    c = (short)set[++pos];
                    Debug.Assert(c != 0);
                    if (c > 0)
                    {
                        if (sawFirstCategory && negated)
                        {
                            return false;
                        }
                        sawFirstCategory = true;
 
                        do
                        {
                            if (numCategories == categories.Length)
                            {
                                return false;
                            }
 
                            categories[numCategories++] = (UnicodeCategory)(c - 1);
                            c = (short)set[++pos];
                        }
                        while (c != 0);
                    }
                    else
                    {
                        if (sawFirstCategory && !negated)
                        {
                            return false;
                        }
                        negated = true;
                        sawFirstCategory = true;
 
                        do
                        {
                            if (numCategories == categories.Length)
                            {
                                return false;
                            }
 
                            categories[numCategories++] = (UnicodeCategory)(-1 - c);
                            c = (short)set[++pos];
                        }
                        while (c != 0);
                    }
                }
            }
 
            // Factor in whether the entire character class is itself negated.
            negated ^= IsNegated(set);
            return true;
        }
 
        /// <summary>Attempts to get a single range stored in the set.</summary>
        /// <param name="set">The set.</param>
        /// <param name="lowInclusive">The inclusive lower-bound of the range, if available.</param>
        /// <param name="highInclusive">The inclusive upper-bound of the range, if available.</param>
        /// <returns>true if the set contained a single range; otherwise, false.</returns>
        /// <remarks>
        /// <paramref name="lowInclusive"/> and <paramref name="highInclusive"/> will be equal if the
        /// range is a singleton or singleton inverse. The range will need to be negated by the caller
        /// if <see cref="IsNegated(string)"/> is true.
        /// </remarks>
        public static bool TryGetSingleRange(string set, out char lowInclusive, out char highInclusive)
        {
            if (set[CategoryLengthIndex] == 0 && // must not have any categories
                set.Length == SetStartIndex + set[SetLengthIndex]) // and no subtraction
            {
                switch ((int)set[SetLengthIndex])
                {
                    case 1:
                        lowInclusive = set[SetStartIndex];
                        highInclusive = LastChar;
                        return true;
 
                    case 2:
                        lowInclusive = set[SetStartIndex];
                        highInclusive = (char)(set[SetStartIndex + 1] - 1);
                        return true;
                }
            }
 
            lowInclusive = highInclusive = '\0';
            return false;
        }
 
        /// <summary>Attempts to get two ranges stored in the set.  The set may be negated.</summary>
        /// <param name="set">The set.</param>
        /// <param name="range0">The first result range.</param>
        /// <param name="range1">The second result range.</param>
        /// <returns>true if the set contained exactly two ranges; otherwise, false.</returns>
        public static bool TryGetDoubleRange(
            string set,
            out (char LowInclusive, char HighInclusive) range0,
            out (char LowInclusive, char HighInclusive) range1)
        {
            if (set[CategoryLengthIndex] == 0 && // must not have any categories
                set.Length == SetStartIndex + set[SetLengthIndex]) // and no subtraction
            {
                int setLength = set[SetLengthIndex];
                if (setLength is 3 or 4)
                {
                    range0 = (set[SetStartIndex], (char)(set[SetStartIndex + 1] - 1));
                    range1 = (set[SetStartIndex + 2], setLength == 3 ? LastChar : (char)(set[SetStartIndex + 3] - 1));
                    return true;
                }
            }
 
            range0 = range1 = ('\0', '\0');
            return false;
        }
 
        /// <summary>Gets all of the characters in the specified set, storing them into the provided span.</summary>
        /// <param name="set">The character class.</param>
        /// <param name="chars">The span into which the chars should be stored.</param>
        /// <returns>
        /// The number of stored chars.  If they won't all fit, 0 is returned.
        /// If 0 is returned, no assumptions can be made about the characters.
        /// </returns>
        /// <remarks>
        /// Only considers character classes that only contain sets (no categories),
        /// just simple sets containing starting/ending pairs (subtraction from those pairs
        /// is factored in, however).The returned characters may be negated: if IsNegated(set)
        /// is false, then the returned characters are the only ones that match; if it returns
        /// true, then the returned characters are the only ones that don't match.
        /// </remarks>
        public static int GetSetChars(string set, Span<char> chars)
        {
            // We get the characters by enumerating the set portion, so we validate that it's
            // set up to enable that, e.g. no categories.
            if (!CanEasilyEnumerateSetContents(set, out bool hasSubtraction))
            {
                return 0;
            }
 
            // Negation with subtraction is too cumbersome to reason about efficiently.
            if (hasSubtraction && IsNegated(set))
            {
                return 0;
            }
 
            // Iterate through the pairs of ranges, storing each value in each range
            // into the supplied span.  If they all won't fit, we give up and return 0.
            // Otherwise we return the number found.  Note that we don't bother to handle
            // the corner case where the last range's upper bound is LastChar (\uFFFF),
            // based on it a) complicating things, and b) it being really unlikely to
            // be part of a small set.
            int setLength = set[SetLengthIndex];
            int count = 0, evaluated = 0;
            for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2)
            {
                int curSetEnd = set[i + 1];
                for (int c = set[i]; c < curSetEnd; c++)
                {
                    // Keep track of how many characters we've checked. This could work
                    // just comparing count rather than evaluated, but we also want to
                    // limit how much work is done here, which we can do by constraining
                    // the number of checks to the size of the storage provided.
                    if (++evaluated > chars.Length)
                    {
                        return 0;
                    }
 
                    // If the set is all ranges but has a subtracted class,
                    // validate the char is actually in the set prior to storing it:
                    // it might be in the subtracted range.
                    if (hasSubtraction && !CharInClass((char)c, set))
                    {
                        continue;
                    }
 
                    Debug.Assert(count <= evaluated);
                    chars[count++] = (char)c;
                }
            }
 
            return count;
        }
 
        /// <summary>
        /// Determines whether two sets may overlap.
        /// </summary>
        /// <returns>false if the two sets do not overlap; true if they may.</returns>
        /// <remarks>
        /// If the method returns false, the caller can be sure the sets do not overlap.
        /// If the method returns true, it's still possible the sets don't overlap.
        /// </remarks>
        public static bool MayOverlap(string set1, string set2)
        {
            // If the sets are identical, there's obviously overlap.
            if (set1 == set2)
            {
                return true;
            }
 
            // If either set is all-inclusive, there's overlap by definition (unless
            // the other set is empty, but that's so rare it's not worth checking.)
            if (set1 == AnyClass || set2 == AnyClass)
            {
                return true;
            }
 
            // If one set is negated and the other one isn't, we're in one of two situations:
            // - The remainder of the sets are identical, in which case these are inverses of
            //   each other, and they don't overlap.
            // - The remainder of the sets aren't identical, in which case there's very likely
            //   overlap, and it's not worth spending more time investigating.
            bool set1Negated = IsNegated(set1);
            bool set2Negated = IsNegated(set2);
            if (set1Negated != set2Negated)
            {
                return !set1.AsSpan(1).SequenceEqual(set2.AsSpan(1));
            }
 
            // If the sets are negated, since they're not equal, there's almost certainly overlap.
            Debug.Assert(set1Negated == set2Negated);
            if (set1Negated)
            {
                return true;
            }
 
            // Special-case some known, common classes that don't overlap.
            if (KnownDistinctSets(set1, set2) ||
                KnownDistinctSets(set2, set1))
            {
                return false;
            }
 
            // If both sets are composed of only Unicode categories, we can compare most cases fairly easily.
            Span<UnicodeCategory> categories1 = stackalloc UnicodeCategory[16], categories2 = stackalloc UnicodeCategory[16];
            if (TryGetOnlyCategories(set1, categories1, out int numCategories1, out bool negated1) &&
                TryGetOnlyCategories(set2, categories2, out int numCategories2, out bool negated2))
            {
                // Check for the case of the sets being negated versions of the same single category,
                // e.g. \d and \D, in which case they don't overlap.
                if (numCategories1 == 1 && numCategories2 == 1 &&
                    categories1[0] == categories2[0] &&
                    negated1 != negated2)
                {
                    return false;
                }
 
                // Otherwise, if either is negated, just assume they may overlap.
                if (negated1 || negated2)
                {
                    return true;
                }
 
                // Check if any category is the same between the two. We've limited the number of elements in the spans
                // to a small number, so we just do the easy thing of comparing the full product.
                foreach (UnicodeCategory cat1 in categories1.Slice(0, numCategories1))
                {
                    foreach (UnicodeCategory cat2 in categories2.Slice(0, numCategories2))
                    {
                        if (cat1 == cat2)
                        {
                            return true;
                        }
                    }
                }
 
                // If we got here, the two sets are disjoint.
                return false;
            }
 
            // If set2 can be easily enumerated (e.g. no unicode categories), then enumerate it and
            // check if any of its members are in set1.  Otherwise, the same for set1.
            if (CanEasilyEnumerateSetContents(set2))
            {
                return MayOverlapByEnumeration(set1, set2);
            }
            else if (CanEasilyEnumerateSetContents(set1))
            {
                return MayOverlapByEnumeration(set2, set1);
            }
 
            // Assume that everything else might overlap.  In the future if it proved impactful, we could be more accurate here,
            // at the expense of more computation time.
            return true;
 
            static bool KnownDistinctSets(string set1, string set2) =>
                // Because of how the set strings are constructed, these known distinct sets aren't handled by our
                // more general UnicodeCategory logic.
                set1 switch
                {
                    SpaceClass => set2 is NotSpaceClass or DigitClass or WordClass,
                    ECMASpaceClass => set2 is NotECMASpaceClass or ECMADigitClass or ECMAWordClass,
                    WordClass => set2 is NotWordClass,
                    ECMAWordClass => set2 is NotECMAWordClass,
                    _ => false,
                };
 
            static bool MayOverlapByEnumeration(string set1, string set2)
            {
                Debug.Assert(!IsNegated(set1) && !IsNegated(set2));
                for (int i = SetStartIndex; i < SetStartIndex + set2[SetLengthIndex]; i += 2)
                {
                    int curSetEnd = set2[i + 1];
                    for (int c = set2[i]; c < curSetEnd; c++)
                    {
                        if (CharInClass((char)c, set1))
                        {
                            return true;
                        }
                    }
                }
 
                return false;
            }
        }
 
        /// <summary>
        /// Determines conservatively whether <paramref name="subset"/> is a subset of <paramref name="superset"/>
        /// (i.e. every character in subset is also in superset). Returns false if the subset relationship cannot be determined.
        /// </summary>
        public static bool IsSubsetOf(string subset, string superset)
        {
            // Identical sets are trivially subsets.
            if (subset == superset)
            {
                return true;
            }
 
            // If superset is the universal set, everything is a subset.
            if (superset == AnyClass)
            {
                return true;
            }
 
            // If subset can be easily enumerated, check that every character in it is also in superset.
            if (!IsNegated(subset) && CanEasilyEnumerateSetContents(subset))
            {
                for (int i = SetStartIndex; i < SetStartIndex + subset[SetLengthIndex]; i += 2)
                {
                    int curSetEnd = subset[i + 1];
                    for (int c = subset[i]; c < curSetEnd; c++)
                    {
                        if (!CharInClass((char)c, superset))
                        {
                            return false;
                        }
                    }
                }
 
                return true;
            }
 
            // If both sets are composed entirely of Unicode categories, check that all
            // categories in subset are also present in superset.
            Span<UnicodeCategory> categories1 = stackalloc UnicodeCategory[16], categories2 = stackalloc UnicodeCategory[16];
            if (TryGetOnlyCategories(subset, categories1, out int numCategories1, out bool negated1) && !negated1 &&
                TryGetOnlyCategories(superset, categories2, out int numCategories2, out bool negated2) && !negated2)
            {
                foreach (UnicodeCategory cat1 in categories1.Slice(0, numCategories1))
                {
                    bool found = false;
                    foreach (UnicodeCategory cat2 in categories2.Slice(0, numCategories2))
                    {
                        if (cat1 == cat2)
                        {
                            found = true;
                            break;
                        }
                    }
 
                    if (!found)
                    {
                        return false;
                    }
                }
 
                return true;
            }
 
            return false;
        }
 
        /// <summary>
        /// Gets whether the specified set is a named set with a reasonably small count
        /// of Unicode characters.
        /// </summary>
        /// <param name="set">The set description.</param>
        /// <param name="chars">The chars that make up the known set.</param>
        /// <param name="negated">Whether the <paramref name="chars"/> need to be negated.</param>
        /// <param name="description">A description suitable for use in C# code as a variable name.</param>
        public static bool IsUnicodeCategoryOfSmallCharCount(string set, [NotNullWhen(true)] out char[]? chars, out bool negated, [NotNullWhen(true)] out string? description)
        {
            switch (set)
            {
                case SpaceClass:
                case NotSpaceClass:
                    chars = s_whitespaceChars;
                    negated = set == NotSpaceClass;
                    description = "whitespace";
                    return true;
            }
 
            chars = default;
            negated = false;
            description = null;
            return false;
        }
 
        /// <summary>Gets whether the specified character participates in case conversion.</summary>
        /// <remarks>
        /// This method is used to perform operations as if they were case-sensitive even if they're
        /// specified as being case-insensitive.  Such a reduction can be applied when the only character
        /// that would lower-case to the one being searched for / compared against is that character itself.
        /// </remarks>
        public static bool ParticipatesInCaseConversion(int comparison)
        {
            Debug.Assert((uint)comparison <= char.MaxValue);
 
            switch (char.GetUnicodeCategory((char)comparison))
            {
                case UnicodeCategory.ClosePunctuation:
                case UnicodeCategory.ConnectorPunctuation:
                case UnicodeCategory.Control:
                case UnicodeCategory.DashPunctuation:
                case UnicodeCategory.DecimalDigitNumber:
                case UnicodeCategory.FinalQuotePunctuation:
                case UnicodeCategory.InitialQuotePunctuation:
                case UnicodeCategory.LineSeparator:
                case UnicodeCategory.OpenPunctuation:
                case UnicodeCategory.OtherNumber:
                case UnicodeCategory.OtherPunctuation:
                case UnicodeCategory.ParagraphSeparator:
                case UnicodeCategory.SpaceSeparator:
                    // All chars in these categories meet the criteria that the only way
                    // `char.ToLower(toTest, AnyCulture) == charInAboveCategory` is when
                    // toTest == charInAboveCategory.
                    return false;
 
                default:
                    // We don't know (without testing the character against every other
                    // character), so assume it does.
                    return true;
            }
        }
 
        /// <summary>Gets whether the specified span participates in case conversion.</summary>
        /// <remarks>The span participates in case conversion if any of its characters do.</remarks>
        public static bool ParticipatesInCaseConversion(ReadOnlySpan<char> s)
        {
            foreach (char c in s)
            {
                if (ParticipatesInCaseConversion(c))
                {
                    return true;
                }
            }
 
            return false;
        }
 
        /// <summary>Gets whether the set description string is for two ASCII letters that case to each other under OrdinalIgnoreCase rules.</summary>
        public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span<char> twoChars)
        {
            Debug.Assert(twoChars.Length >= 2);
            return
                !IsNegated(set) &&
                GetSetChars(set, twoChars) == 2 &&
                twoChars[0] < 128 &&
                twoChars[1] < 128 &&
                twoChars[0] != twoChars[1] &&
                char.IsLetter(twoChars[0]) &&
                char.IsLetter(twoChars[1]) &&
                (twoChars[0] | 0x20) == (twoChars[1] | 0x20);
        }
 
        /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
        /// <remarks>This may enumerate negated characters if the set is negated.  This will return false if the set has subtraction.</remarks>
        private static bool CanEasilyEnumerateSetContents(string set) =>
            CanEasilyEnumerateSetContents(set, out bool hasSubtraction) &&
            !hasSubtraction;
 
        /// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
        /// <remarks>This may enumerate negated characters if the set is negated, and it may be an overestimate if the set contains subtraction.</remarks>
        private static bool CanEasilyEnumerateSetContents(string set, out bool hasSubtraction)
        {
            hasSubtraction = IsSubtraction(set);
            return
                set.Length > SetStartIndex &&
                set[SetLengthIndex] > 0 &&
                set[SetLengthIndex] % 2 == 0 &&
                set[CategoryLengthIndex] == 0;
        }
 
        /// <summary>Provides results from <see cref="Analyze"/>.</summary>
        internal struct CharClassAnalysisResults
        {
            /// <summary>true if the set contains only ranges; false if it contains Unicode categories and/or subtraction.</summary>
            public bool OnlyRanges;
            /// <summary>true if we know for sure that the set contains only ASCII values; otherwise, false.</summary>
            /// <remarks>This can only be true if <see cref="OnlyRanges"/> is true.</remarks>
            public bool ContainsOnlyAscii;
            /// <summary>true if we know for sure that the set doesn't contain any ASCII values; otherwise, false.</summary>
            /// <remarks>This can only be true if <see cref="OnlyRanges"/> is true.</remarks>
            public bool ContainsNoAscii;
            /// <summary>true if we know for sure that all ASCII values are in the set; otherwise, false.</summary>
            /// <remarks>This can only be true if <see cref="OnlyRanges"/> is true.</remarks>
            public bool AllAsciiContained;
            /// <summary>true if we know for sure that all non-ASCII values are in the set; otherwise, false.</summary>
            /// <remarks>This can only be true if <see cref="OnlyRanges"/> is true.</remarks>
            public bool AllNonAsciiContained;
            /// <summary>The inclusive lower bound.</summary>
            /// <remarks>This is only valid if <see cref="OnlyRanges"/> is true.</remarks>
            public int LowerBoundInclusiveIfOnlyRanges;
            /// <summary>The exclusive upper bound.</summary>
            /// <remarks>This is only valid if <see cref="OnlyRanges"/> is true.</remarks>
            public int UpperBoundExclusiveIfOnlyRanges;
        }
 
        /// <summary>Analyzes the set to determine some basic properties that can be used to optimize usage.</summary>
        internal static CharClassAnalysisResults Analyze(string set)
        {
            bool isNegated = IsNegated(set);
 
            // The analysis is performed based entirely on ranges contained within the set.
            // Thus, we require that it can be "easily enumerated", meaning it contains only
            // ranges (and more specifically those with both the lower inclusive and upper
            // exclusive bounds specified). We also permit the set to contain a subtracted
            // character class, as for non-negated sets, that can only narrow what's permitted,
            // and the analysis can be performed on the overestimate of the set prior to subtraction.
            // However, negation is performed before subtraction, which means we can't trust
            // the ranges to inform AllNonAsciiContained and AllAsciiContained, as the subtraction
            // could create holes in those.  As such, while we can permit subtraction for non-negated
            // sets, for negated sets, we need to bail.
            if (!CanEasilyEnumerateSetContents(set, out bool hasSubtraction) ||
                (isNegated && hasSubtraction))
            {
                // We can't make any strong claims about the set.
                return default;
            }
 
            char firstValueInclusive = set[SetStartIndex];
            char lastValueExclusive = set[SetStartIndex + set[SetLengthIndex] - 1];
 
            if (isNegated)
            {
                // We're negated: if the upper bound of the range is ASCII, that means everything
                // above it is actually included, meaning all non-ASCII are in the class.
                // Similarly if the lower bound is non-ASCII, that means in a negated world
                // everything ASCII is included.
                Debug.Assert(!hasSubtraction);
                return new CharClassAnalysisResults
                {
                    OnlyRanges = true,
                    AllNonAsciiContained = lastValueExclusive <= 128,
                    AllAsciiContained = firstValueInclusive >= 128,
                    ContainsNoAscii = firstValueInclusive == 0 && set[SetStartIndex + 1] >= 128,
                    ContainsOnlyAscii = false,
                    LowerBoundInclusiveIfOnlyRanges = firstValueInclusive,
                    UpperBoundExclusiveIfOnlyRanges = lastValueExclusive,
                };
            }
 
            // If the upper bound is ASCII, that means everything included in the class is ASCII.
            // Similarly if the lower bound is non-ASCII, that means no ASCII is in the class.
            return new CharClassAnalysisResults
            {
                OnlyRanges = true,
                AllNonAsciiContained = false,
                AllAsciiContained = firstValueInclusive == 0 && set[SetStartIndex + 1] >= 128 && !hasSubtraction,
                ContainsOnlyAscii = lastValueExclusive <= 128,
                ContainsNoAscii = firstValueInclusive >= 128,
                LowerBoundInclusiveIfOnlyRanges = firstValueInclusive,
                UpperBoundExclusiveIfOnlyRanges = lastValueExclusive,
            };
        }
 
        internal static bool IsSubtraction(string charClass) =>
            charClass.Length > SetStartIndex +
            charClass[CategoryLengthIndex] +
            charClass[SetLengthIndex];
 
        internal static bool IsNegated(string set) => set[FlagsIndex] == 1;
 
        internal static bool IsNegated(string set, int setOffset) => set[FlagsIndex + setOffset] == 1;
 
        public static bool IsECMAWordChar(char ch) =>
            // According to ECMA-262, \s, \S, ., ^, and $ use Unicode-based interpretations of
            // whitespace and newline, while \d, \D\, \w, \W, \b, and \B use ASCII-only
            // interpretations of digit, word character, and word boundary.  In other words,
            // no special treatment of Unicode ZERO WIDTH NON-JOINER (ZWNJ U+200C) and
            // ZERO WIDTH JOINER (ZWJ U+200D) is required for ECMA word boundaries.
            ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter
            (((uint)ch - '0') < 10) || // digit
            ch == '_' || // underscore
            ch == '\u0130'; // latin capital letter I with dot above
 
        /// <summary>16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char.</summary>
        private static ReadOnlySpan<byte> WordCharAsciiLookup =>
        [
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
            0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
        ];
 
        /// <summary>Mask of Unicode categories that combine to form [\\w]</summary>
        private const int WordCategoriesMask =
            1 << (int)UnicodeCategory.UppercaseLetter |
            1 << (int)UnicodeCategory.LowercaseLetter |
            1 << (int)UnicodeCategory.TitlecaseLetter |
            1 << (int)UnicodeCategory.ModifierLetter |
            1 << (int)UnicodeCategory.OtherLetter |
            1 << (int)UnicodeCategory.NonSpacingMark |
            1 << (int)UnicodeCategory.DecimalDigitNumber |
            1 << (int)UnicodeCategory.ConnectorPunctuation;
 
        /// <summary>Determines whether a character is considered a word character for the purposes of testing the \w set.</summary>
        public static bool IsWordChar(char ch)
        {
            // This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
            // returns true for \u200c and \u200d.
 
            // Bitmap for whether each character 0 through 127 is in [\\w]
            ReadOnlySpan<byte> ascii = WordCharAsciiLookup;
 
            // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
            int chDiv8 = ch >> 3;
            return (uint)chDiv8 < (uint)ascii.Length ?
                (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
        }
 
        /// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
        public static bool IsBoundaryWordChar(char ch)
        {
            // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
            // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
            // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
            // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
            const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
 
            // Bitmap for whether each character 0 through 127 is in [\\w]
            ReadOnlySpan<byte> ascii = WordCharAsciiLookup;
 
            // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
            int chDiv8 = ch >> 3;
            return (uint)chDiv8 < (uint)ascii.Length ?
                (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
                (IsWordCategory(CharUnicodeInfo.GetUnicodeCategory(ch)) ||
                 (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
        }
 
        private static bool IsWordCategory(UnicodeCategory category) =>
            (WordCategoriesMask & (1 << (int)category)) != 0;
 
        /// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
        /// <remarks>This isn't specific to RegexCharClass; it's just a convenient place to host it.</remarks>
        public static bool DifferByOneBit(char a, char b, out int mask)
        {
            mask = a ^ b;
            return BitOperations.IsPow2(mask);
        }
 
        /// <summary>Determines a character's membership in a character class (via the string representation of the class).</summary>
        /// <param name="ch">The character.</param>
        /// <param name="set">The string representation of the character class.</param>
        /// <param name="asciiLazyCache">A lazily-populated cache for ASCII results stored in a 256-bit array.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache)
        {
            // The uint[] contains 8 ints, or 256 bits.  These are laid out as pairs, where the first bit in the pair
            // says whether the second bit in the pair has already been computed.  Once a value is computed, it's never
            // changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit
            // has been set.  If the known bit hasn't been set, then we proceed to look it up, and then swap in the result.
            const int CacheArrayLength = 8;
            Debug.Assert(asciiLazyCache is null || asciiLazyCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters");
 
            // If the value is ASCII and already has an answer for this value, use it.
            if (asciiLazyCache is uint[] cache)
            {
                int index = ch >> 4;
                if ((uint)index < (uint)cache.Length)
                {
                    Debug.Assert(ch < 128);
                    uint current = cache[index];
                    uint bit = 1u << ((ch & 0xF) << 1);
                    if ((current & bit) != 0)
                    {
                        return (current & (bit << 1)) != 0;
                    }
                }
            }
 
            // For ASCII, lazily initialize. For non-ASCII, just compute the value.
            return ch < 128 ?
                InitializeValue(ch, set, ref asciiLazyCache) :
                CharInClassIterative(ch, set, 0);
 
            static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
            {
                // (After warm-up, we should find ourselves rarely getting here.)
                Debug.Assert(ch < 128);
 
                // Compute the result and determine which bits to write back to the array and "or" the bits back in a thread-safe manner.
                bool isInClass = CharInClass(ch, set);
                uint bitsToSet = 1u << ((ch & 0xF) << 1);
                if (isInClass)
                {
                    bitsToSet |= bitsToSet << 1;
                }
 
                uint[]? cache = asciiLazyCache ?? Interlocked.CompareExchange(ref asciiLazyCache, new uint[CacheArrayLength], null) ?? asciiLazyCache;
#if NET
                Interlocked
#else
                InterlockedExtensions
#endif
                    .Or(ref cache[ch >> 4], bitsToSet);
 
                // Return the computed value.
                return isInClass;
            }
        }
 
        /// <summary>
        /// Determines a character's membership in a character class (via the string representation of the class).
        /// </summary>
        public static bool CharInClass(char ch, string set) =>
            CharInClassIterative(ch, set, 0);
 
        private static bool CharInClassIterative(char ch, string set, int start)
        {
            bool inClass = false;
 
            while (true)
            {
                int setLength = set[start + SetLengthIndex];
                int categoryLength = set[start + CategoryLengthIndex];
                int endPosition = start + SetStartIndex + setLength + categoryLength;
 
                if (CharInClassInternal(ch, set, start, setLength, categoryLength) == IsNegated(set, start))
                {
                    break;
                }
 
                inClass = !inClass;
 
                if (set.Length <= endPosition)
                {
                    break;
                }
 
                start = endPosition;
            }
 
            return inClass;
        }
 
        /// <summary>
        /// Determines a character's membership in a character class (via the
        /// string representation of the class).
        /// </summary>
        private static bool CharInClassInternal(char ch, string set, int start, int setLength, int categoryLength)
        {
            int min = start + SetStartIndex;
            int max = min + setLength;
 
            while (min != max)
            {
                int mid = (min + max) >> 1;
                if (ch < set[mid])
                {
                    max = mid;
                }
                else
                {
                    min = mid + 1;
                }
            }
 
            // The starting position of the set within the character class determines
            // whether what an odd or even ending position means.  If the start is odd,
            // an *even* ending position means the character was in the set.  With recursive
            // subtractions in the mix, the starting position = start+SetStartIndex.  Since we know that
            // SetStartIndex is odd, we can simplify it out of the equation.  But if it changes we need to
            // reverse this check.
            Debug.Assert((SetStartIndex & 0x1) == 1, "If SetStartIndex is not odd, the calculation below this will be reversed");
            if ((min & 0x1) == (start & 0x1))
            {
                return true;
            }
 
            if (categoryLength == 0)
            {
                return false;
            }
 
            return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength));
        }
 
        private static bool CharInCategory(char ch, ReadOnlySpan<char> categorySetSegment)
        {
            UnicodeCategory chcategory = char.GetUnicodeCategory(ch);
 
            for (int i = 0; i < categorySetSegment.Length; i++)
            {
                int curcat = (short)categorySetSegment[i];
 
                if (curcat == 0)
                {
                    // zero is our marker for a group of categories - treated as a unit
                    if (CharInCategoryGroup(chcategory, categorySetSegment, ref i))
                    {
                        return true;
                    }
                }
                else if (curcat > 0)
                {
                    // greater than zero is a positive case
 
                    if (curcat == SpaceConst)
                    {
                        if (char.IsWhiteSpace(ch))
                        {
                            return true;
                        }
                    }
                    else if (chcategory == (UnicodeCategory)(curcat - 1))
                    {
                        return true;
                    }
                }
                else
                {
                    // less than zero is a negative case
                    if (curcat == NotSpaceConst)
                    {
                        if (!char.IsWhiteSpace(ch))
                        {
                            return true;
                        }
                    }
                    else if (chcategory != (UnicodeCategory)(-1 - curcat))
                    {
                        return true;
                    }
                }
            }
 
            return false;
        }
 
        /// <summary>
        /// This is used for categories which are composed of other categories - L, N, Z, W...
        /// These groups need special treatment when they are negated
        /// </summary>
        private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan<char> category, ref int i)
        {
            int pos = i + 1;
            int curcat = (short)category[pos];
            bool result;
 
            if (curcat > 0)
            {
                // positive case - the character must be in ANY of the categories in the group
                result = false;
                do
                {
                    result |= chcategory == (UnicodeCategory)(curcat - 1);
                    curcat = (short)category[++pos];
                }
                while (curcat != 0);
            }
            else
            {
                // negative case - the character must be in NONE of the categories in the group
                Debug.Assert(curcat < 0);
                result = true;
                do
                {
                    result &= chcategory != (UnicodeCategory)(-1 - curcat);
                    curcat = (short)category[++pos];
                }
                while (curcat != 0);
            }
 
            i = pos;
            return result;
        }
 
        public static RegexCharClass Parse(string charClass)
        {
            RegexCharClass? outermost = null;
            RegexCharClass? current = null;
 
            int pos = 0;
            while (true)
            {
                int setLength = charClass[pos + SetLengthIndex];
                int categoryLength = charClass[pos + CategoryLengthIndex];
                int endPosition = pos + SetStartIndex + setLength + categoryLength;
 
                List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(pos));
 
                StringBuilder? categoriesBuilder = null;
                if (categoryLength > 0)
                {
                    int end = pos + SetStartIndex + setLength;
                    categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength));
                }
 
                var level = new RegexCharClass(IsNegated(charClass, pos), ranges, categoriesBuilder, subtraction: null);
 
                if (outermost is null)
                {
                    outermost = level;
                }
                else
                {
                    current!.AddSubtraction(level);
                }
                current = level;
 
                if (charClass.Length <= endPosition)
                {
                    break;
                }
 
                pos = endPosition;
            }
 
            return outermost!;
        }
 
        /// <summary>Computes a list of all of the character ranges in the set string.</summary>
        public static List<(char First, char Last)>? ComputeRanges(ReadOnlySpan<char> set)
        {
            int setLength = set[SetLengthIndex];
            int i = SetStartIndex;
            int end = i + setLength;
 
            List<(char First, char Last)>? ranges = null;
            if (setLength > 0)
            {
                ranges = new List<(char First, char Last)>(setLength);
                while (i < end)
                {
                    char first = set[i];
                    i++;
 
                    char last = i < end ? (char)(set[i] - 1) : LastChar;
                    i++;
 
                    ranges.Add((first, last));
                }
            }
 
            return ranges;
        }
 
        /// <summary>Cache of character class strings for single ASCII characters.</summary>
        private static readonly string[] s_asciiStrings = new string[128];
        /// <summary>Cache of character class strings for pairs of upper/lower-case ASCII letters.</summary>
        private static readonly string[] s_asciiLetterPairStrings = new string[26];
 
        /// <summary>Creates a set string for a single character.</summary>
        /// <param name="c">The character for which to create the set.</param>
        /// <returns>The create set string.</returns>
        public static string OneToStringClass(char c)
            => CharsToStringClass([c]);
 
        internal static string CharsToStringClass(ReadOnlySpan<char> chars)
        {
#if DEBUG
            // Make sure they're all sorted with no duplicates
            for (int index = 0; index < chars.Length - 1; index++)
            {
                Debug.Assert(chars[index] < chars[index + 1]);
            }
#endif
 
            switch (chars.Length)
            {
                case 0:
                    // If there aren't any chars, just return an empty class.
                    return EmptyClass;
 
                case 1:
                    // Special-case ASCII characters to avoid the computation/allocation in this very common case.
                    if (chars[0] < 128)
                    {
                        string[] asciiStrings = s_asciiStrings;
                        if (chars[0] < asciiStrings.Length)
                        {
                            return asciiStrings[chars[0]] ??= $"\0\u0002\0{chars[0]}{(char)(chars[0] + 1)}";
                        }
                    }
                    break;
 
                case 2:
                    // Special-case cased ASCII letter pairs to avoid the computation/allocation in this very common case.
                    int masked0 = chars[0] | 0x20;
                    if ((uint)(masked0 - 'a') <= 'z' - 'a' && masked0 == (chars[1] | 0x20))
                    {
                        return s_asciiLetterPairStrings[masked0 - 'a'] ??= $"\0\u0004\0{(char)(masked0 & ~0x20)}{(char)((masked0 & ~0x20) + 1)}{(char)masked0}{(char)(masked0 + 1)}";
                    }
                    break;
            }
 
            // Count how many characters there actually are.  All but the very last possible
            // char value will have two characters, one for the inclusive beginning of range
            // and one for the exclusive end of range.
            int count = chars.Length * 2;
            if (chars[chars.Length - 1] == LastChar)
            {
                count--;
            }
 
            // Get the pointer/length of the span to be able to pass it into string.Create.
            ReadOnlySpan<char> tmpChars = chars; // avoid address exposing the span and impacting the other code in the method that uses it
#if NET
            return string.Create(SetStartIndex + count, tmpChars, static (span, chars) =>
            {
                // Fill in the set string
                span[FlagsIndex] = (char)0;
                span[SetLengthIndex] = (char)(span.Length - SetStartIndex);
                span[CategoryLengthIndex] = (char)0;
                int i = SetStartIndex;
                foreach (char c in chars)
                {
                    span[i++] = c;
                    if (c != LastChar)
                    {
                        span[i++] = (char)(c + 1);
                    }
                }
                Debug.Assert(i == span.Length);
            });
#else
            unsafe
            {
                return StringExtensions.Create(SetStartIndex + count, (IntPtr)(&tmpChars), static (span, charsPtr) =>
                {
                    // Fill in the set string
                    span[FlagsIndex] = (char)0;
                    span[SetLengthIndex] = (char)(span.Length - SetStartIndex);
                    span[CategoryLengthIndex] = (char)0;
                    int i = SetStartIndex;
                    ReadOnlySpan<char> chars = *(ReadOnlySpan<char>*)charsPtr;
                    foreach (char c in chars)
                    {
                        span[i++] = c;
                        if (c != LastChar)
                        {
                            span[i++] = (char)(c + 1);
                        }
                    }
                    Debug.Assert(i == span.Length);
                });
            }
#endif
        }
 
 
        /// <summary>
        /// Constructs the string representation of the class.
        /// </summary>
        public string ToStringClass()
        {
            var vsb = new ValueStringBuilder(stackalloc char[256]);
 
            RegexCharClass? current = this;
            do
            {
                current.Canonicalize();
 
                int initialLength = vsb.Length;
                int categoriesLength = current._categories?.Length ?? 0;
                Span<char> headerSpan = vsb.AppendSpan(SetStartIndex);
                headerSpan[FlagsIndex] = (char)(current._negate ? 1 : 0);
                headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added)
                headerSpan[CategoryLengthIndex] = (char)categoriesLength;
 
                // Append ranges
                List<(char First, char Last)>? rangelist = current._rangelist;
                if (rangelist != null)
                {
                    for (int i = 0; i < rangelist.Count; i++)
                    {
                        (char First, char Last) currentRange = rangelist[i];
                        vsb.Append(currentRange.First);
                        if (currentRange.Last != LastChar)
                        {
                            vsb.Append((char)(currentRange.Last + 1));
                        }
                    }
                }
 
                // Update the range length.  The ValueStringBuilder may have already had some
                // contents (if this is a subtactor), so we need to offset by the initial length.
                vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);
 
                // Append categories
                if (categoriesLength != 0)
                {
                    foreach (ReadOnlyMemory<char> chunk in current._categories!.GetChunks())
                    {
                        vsb.Append(chunk.Span);
                    }
                }
 
                current = current._subtractor;
            }
            while (current is not null);
 
            return vsb.ToString();
        }
 
        /// <summary>
        /// Logic to reduce a character class to a unique, sorted form.
        /// </summary>
        private void Canonicalize()
        {
            List<(char First, char Last)>? rangelist = _rangelist;
            if (rangelist != null)
            {
                // Find and eliminate overlapping or abutting ranges.
                if (rangelist.Count > 1)
                {
                    rangelist.Sort((x, y) => x.First.CompareTo(y.First));
 
                    bool done = false;
                    int j = 0;
 
                    for (int i = 1; ; i++)
                    {
                        char last;
                        for (last = rangelist[j].Last; ; i++)
                        {
                            if (i == rangelist.Count || last == LastChar)
                            {
                                done = true;
                                break;
                            }
 
                            (char First, char Last) currentRange;
                            if ((currentRange = rangelist[i]).First > last + 1)
                            {
                                break;
                            }
 
                            if (last < currentRange.Last)
                            {
                                last = currentRange.Last;
                            }
                        }
 
                        rangelist[j] = (rangelist[j].First, last);
 
                        j++;
 
                        if (done)
                        {
                            break;
                        }
 
                        if (j < i)
                        {
                            rangelist[j] = rangelist[i];
                        }
                    }
 
                    rangelist.RemoveRange(j, rangelist.Count - j);
                }
 
                // If the class now represents a single negated range, but does so by including every
                // other character, invert it to produce a normalized form with a single range.  This
                // is valuable for subsequent optimizations in most of the engines.
                if (!_negate &&
                    _subtractor is null &&
                    (_categories is null || _categories.Length == 0))
                {
                    if (rangelist.Count == 2)
                    {
                        // There are two ranges in the list.  See if there's one missing range between them.
                        // Such a range might be as small as a single character.
                        if (rangelist[0].First == 0 &&
                            rangelist[1].Last == LastChar &&
                            rangelist[0].Last < rangelist[1].First - 1)
                        {
                            rangelist[0] = ((char)(rangelist[0].Last + 1), (char)(rangelist[1].First - 1));
                            rangelist.RemoveAt(1);
                            _negate = true;
                        }
                    }
                    else if (rangelist.Count == 1)
                    {
                        if (rangelist[0].First == 0)
                        {
                            // There's only one range in the list.  Does it include everything but the last char?
                            if (rangelist[0].Last == LastChar - 1)
                            {
                                rangelist[0] = (LastChar, LastChar);
                                _negate = true;
                            }
                        }
                        else if (rangelist[0].First == 1)
                        {
                            // Or everything but the first char?
                            if (rangelist[0].Last == LastChar)
                            {
                                rangelist[0] = ('\0', '\0');
                                _negate = true;
                            }
                        }
                    }
                }
 
                // If the class now has a range that includes everything, and if it doesn't have subtraction,
                // we can remove all of its categories, as they're duplicative (the set already includes everything).
                if (!_negate &&
                    _subtractor is null &&
                    _categories?.Length > 0 &&
                    rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar)
                {
                    _categories.Clear();
                }
 
                // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories,
                // see if that character is in the categories.  If it is, then we can replace whole thing with a complete "any" range.
                // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set
                // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with
                // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number
                // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges.
                // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity
                // it takes to do so.  Thus, we limit this to the common case of a single missing character.)
                if (!_negate &&
                    _subtractor is null &&
                    _categories?.Length > 0 &&
                    rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar)
                {
                    var vsb = new ValueStringBuilder(stackalloc char[256]);
                    foreach (ReadOnlyMemory<char> chunk in _categories!.GetChunks())
                    {
                        vsb.Append(chunk.Span);
                    }
 
                    if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan()))
                    {
                        rangelist.RemoveAt(1);
                        rangelist[0] = ('\0', LastChar);
                    }
                    else
                    {
                        _negate = true;
                        rangelist.RemoveAt(1);
                        char notOne = (char)(rangelist[0].Last + 1);
                        rangelist[0] = (notOne, notOne);
                    }
                    _categories.Clear();
 
                    vsb.Dispose();
                }
            }
        }
 
        private static ReadOnlySpan<char> RangesFromProperty(string capname, bool invert, string pattern, int currentPos)
        {
            int min = 0;
            int max = s_propTable.Length;
            while (min != max)
            {
                int mid = (min + max) / 2;
                int res = string.Compare(capname, s_propTable[mid][0], StringComparison.Ordinal);
                if (res < 0)
                {
                    max = mid;
                }
                else if (res > 0)
                {
                    min = mid + 1;
                }
                else
                {
                    string set = s_propTable[mid][1];
                    Debug.Assert(!string.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");
                    return
                        !invert ? set.AsSpan() :
                        set[0] == '\0' ? set.AsSpan(1) :
                        ("\0" + set).AsSpan();
                }
            }
 
            throw new RegexParseException(RegexParseError.UnrecognizedUnicodeProperty, currentPos,
                SR.Format(SR.MakeException, pattern, currentPos, SR.Format(SR.UnrecognizedUnicodeProperty, capname)));
        }
 
#if DEBUG || !SYSTEM_TEXT_REGULAREXPRESSIONS
        private static readonly string[] CategoryIdToName = PopulateCategoryIdToName();
 
        private static string[] PopulateCategoryIdToName()
        {
            // Populate category reverse lookup used for diagnostic output
 
            var temp = new List<KeyValuePair<string, string>>(s_definedCategories);
            temp.RemoveAll(kvp => kvp.Value.Length != 1);
            temp.Sort((kvp1, kvp2) => ((short)kvp1.Value[0]).CompareTo((short)kvp2.Value[0]));
            return temp.ConvertAll(kvp => kvp.Key).ToArray();
        }
 
        /// <summary>
        /// Produces a human-readable description for a set string.
        /// </summary>
        /// <param name="set">The set string to describe.</param>
        /// <param name="forceBrackets">Whether to force brackets around the description even for single characters.</param>
        public static string DescribeSet(string set, bool forceBrackets = false)
        {
            int setLength = set[SetLengthIndex];
            int categoryLength = set[CategoryLengthIndex];
            int endPosition = SetStartIndex + setLength + categoryLength;
            bool negated = IsNegated(set);
            Span<char> scratch = stackalloc char[32];
 
            // Special-case set of a single character to output that character without set square brackets.
            if (!forceBrackets && // brackets not forced
                !negated && // no negation
                categoryLength == 0 && // no categories
                endPosition >= set.Length && // no subtraction
                setLength == 2 && // don't bother handling the case of the single character being 0xFFFF, in which case setLength would be 1
                set[SetStartIndex] + 1 == set[SetStartIndex + 1])
            {
                return DescribeChar(set[SetStartIndex]);
            }
 
            int index = SetStartIndex;
            char ch1;
            char ch2;
            StringBuilder desc = new StringBuilder().Append('[');
 
            void RenderRanges()
            {
                int rangesEnd = SetStartIndex + set[SetLengthIndex];
                while (index < rangesEnd)
                {
                    ch1 = set[index];
                    if (index + 1 < rangesEnd)
                    {
                        ch2 = (char)(set[index + 1] - 1);
                        index += 2;
                    }
                    else
                    {
                        ch2 = LastChar;
                        index++;
                    }
 
                    desc.Append(DescribeChar(ch1));
 
                    if (ch2 != ch1)
                    {
                        if (ch1 + 1 != ch2)
                        {
                            desc.Append('-');
                        }
 
                        desc.Append(DescribeChar(ch2));
                    }
                }
            }
 
            // Special-case sets where the description will be more succinct by rendering it as negated, e.g. where
            // there are fewer gaps between ranges than there are ranges.  This is the case when the first range
            // includes \0 and the last range includes 0xFFFF, and typically occurs for sets that were actually
            // initially negated but ended up as non-negated from various transforms along the way.
            if (categoryLength == 0 && // no categories
                endPosition >= set.Length && // no subtraction
                setLength % 2 == 1 && // odd number of values because the last range won't include an upper bound
                set[index] == 0)
            {
                // We now have an odd number of values structures as:
                //     0,end0,start1,end1,start2,end2,...,startN
                // Rather than walking the pairs starting from index 0, we walk pairs starting from index 1 (creating a range from end0 to start1),
                // since we're creating ranges from the gaps.
                index++;
                desc.Append('^');
                RenderRanges();
                return desc.Append(']').ToString();
            }
 
            if (negated)
            {
                desc.Append('^');
            }
 
            RenderRanges();
 
            while (index < SetStartIndex + set[SetLengthIndex] + set[CategoryLengthIndex])
            {
                ch1 = set[index];
                if (ch1 == 0)
                {
                    const char GroupChar = '\0';
                    int lastindex = set.IndexOf(GroupChar, index + 1);
                    if (lastindex != -1)
                    {
                        ReadOnlySpan<char> group = set.AsSpan(index, lastindex - index + 1);
                        switch (group)
                        {
                            case WordCategories:
                                desc.Append(@"\w");
                                break;
 
                            case NotWordCategories:
                                desc.Append(@"\W");
                                break;
 
                            default:
                                // The inverse of a group as created by AddCategoryFromName simply negates every character as a 16-bit value.
                                Span<char> invertedGroup = group.Length <= scratch.Length ? scratch.Slice(0, group.Length) : new char[group.Length];
                                for (int i = 0; i < group.Length; i++)
                                {
                                    invertedGroup[i] = (char)-(short)group[i];
                                }
 
                                // Determine whether the group is a known Unicode category, e.g. \p{Mc}, or group of categories, e.g. \p{L},
                                // or the inverse of those.
                                foreach (KeyValuePair<string, string> kvp in s_definedCategories)
                                {
                                    bool equalsGroup = group.SequenceEqual(kvp.Value.AsSpan());
                                    if (equalsGroup || invertedGroup.SequenceEqual(kvp.Value.AsSpan()))
                                    {
                                        desc.Append(equalsGroup ? @"\p{" : @"\P{").Append(kvp.Key).Append('}');
                                        break;
                                    }
                                }
                                break;
                        }
 
                        index = lastindex;
                    }
                }
                else
                {
                    desc.Append(DescribeCategory(ch1));
                }
 
                index++;
            }
 
            if (set.Length > endPosition)
            {
                desc.Append('-').Append(DescribeSet(set.Substring(endPosition), forceBrackets: true));
            }
 
            return desc.Append(']').ToString();
        }
 
        /// <summary>Produces a human-readable description for a single character.</summary>
        public static string DescribeChar(char ch) =>
            ch switch
            {
                '\0' => @"\0",
                '\a' => "\\a",
                '\b' => "\\b",
                '\t' => "\\t",
                '\r' => "\\r",
                '\v' => "\\v",
                '\f' => "\\f",
                '\n' => "\\n",
                '\\' => "\\\\",
                '-'  => "\\-",
                >= ' ' and <= '~' => ch.ToString(),
                _ => $"\\u{(uint)ch:X4}"
            };
 
        private static string DescribeCategory(char ch) =>
            (short)ch switch
            {
                SpaceConst => @"\s",
                NotSpaceConst => @"\S",
                (short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\d",
                -(short)(UnicodeCategory.DecimalDigitNumber + 1) => @"\D",
                < 0 => $"\\P{{{CategoryIdToName[-(short)ch - 1]}}}",
                _ => $"\\p{{{CategoryIdToName[ch - 1]}}}",
            };
#endif
    }
}