RegexNodeConverter.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Threading;
 
namespace System.Text.RegularExpressions.Symbolic
{
    /// <summary>Provides functionality to convert <see cref="RegexNode"/>s to corresponding <see cref="SymbolicRegexNode{S}"/>s.</summary>
    /// <remarks>Constructs a regex to symbolic finite automata converter</remarks>
    internal sealed class RegexNodeConverter(SymbolicRegexBuilder<BDD> builder, Hashtable? captureSparseMapping)
    {
        /// <summary>Capture information.</summary>
        private readonly Hashtable? _captureSparseMapping = captureSparseMapping;
        /// <summary>The builder to use to create the <see cref="SymbolicRegexNode{S}"/> nodes.</summary>
        internal readonly SymbolicRegexBuilder<BDD> _builder = builder;
 
        /// <summary>Cache of BDDs created to represent <see cref="RegexCharClass"/> set strings.</summary>
        /// <remarks>This cache is useful iff the same character class is used multiple times in the same regex, but that's fairly common.</remarks>
        private Dictionary<string, BDD>? _setBddCache;
 
        /// <summary>Converts the root <see cref="RegexNode"/> into its corresponding <see cref="SymbolicRegexNode{S}"/>.</summary>
        /// <param name="root">The root node to convert.</param>
        /// <returns>The generated <see cref="SymbolicRegexNode{S}"/> that corresponds to the supplied <paramref name="root"/>.</returns>
        internal SymbolicRegexNode<BDD> ConvertToSymbolicRegexNode(RegexNode root)
        {
            Debug.Assert(_builder is not null);
 
            // Create the root list that will store the built-up result.
            DoublyLinkedList<SymbolicRegexNode<BDD>> rootResult = new();
 
            // Create a stack to be processed in order to process iteratively rather than recursively, and push the root on.
            Stack<(RegexNode Node, DoublyLinkedList<SymbolicRegexNode<BDD>> Result, DoublyLinkedList<SymbolicRegexNode<BDD>>[]? ChildResults)> stack = new();
            stack.Push((root, rootResult, CreateChildResultArray(root.ChildCount())));
 
            // Continue to iterate until the stack is empty, popping the next item on each iteration.
            // Some popped items may be pushed back on as part of processing.
            while (stack.TryPop(out (RegexNode Node, DoublyLinkedList<SymbolicRegexNode<BDD>> Result, DoublyLinkedList<SymbolicRegexNode<BDD>>[]? ChildResults) popped))
            {
                RegexNode node = popped.Node;
                DoublyLinkedList<SymbolicRegexNode<BDD>> result = popped.Result;
                DoublyLinkedList<SymbolicRegexNode<BDD>>[]? childResults = popped.ChildResults;
                Debug.Assert(childResults is null || childResults.Length != 0);
 
                if (childResults is null || childResults[0] is null)
                {
                    // Child nodes have not been converted yet
                    // Handle each node kind as-is appropriate.
                    switch (node.Kind)
                    {
                        // Singletons and multis
 
                        case RegexNodeKind.One:
                            result.AddLast(_builder.CreateSingleton(_builder._charSetSolver.CreateBDDFromChar(node.Ch)));
                            break;
 
                        case RegexNodeKind.Notone:
                            result.AddLast(_builder.CreateSingleton(_builder._solver.Not(_builder._charSetSolver.CreateBDDFromChar(node.Ch))));
                            break;
 
                        case RegexNodeKind.Set:
                            result.AddLast(ConvertSet(node));
                            break;
 
                        case RegexNodeKind.Multi:
                            {
                                // Create a BDD for each character in the string and concatenate them.
                                string? str = node.Str;
                                Debug.Assert(str is not null);
                                foreach (char c in str)
                                {
                                    result.AddLast(_builder.CreateSingleton(_builder._charSetSolver.CreateBDDFromChar(c)));
                                }
                                break;
                            }
 
                        // The following five cases are the only node kinds that are pushed twice:
                        // Joins, general loops, and supported captures
 
                        case RegexNodeKind.Concatenate:
                        case RegexNodeKind.Alternate:
                        case RegexNodeKind.Loop:
                        case RegexNodeKind.Lazyloop:
                        case RegexNodeKind.Capture when node.N == -1: // N == -1 because balancing groups (which have N >= 0) aren't supported
                            {
                                Debug.Assert(childResults is not null && childResults.Length == node.ChildCount());
 
                                // Push back the temporarily popped item. Next time this work item is seen, its ChildResults list will be ready.
                                stack.Push(popped);
 
                                // Push all the children to be converted
                                for (int i = 0; i < node.ChildCount(); ++i)
                                {
                                    childResults[i] = new DoublyLinkedList<SymbolicRegexNode<BDD>>();
                                    stack.Push((node.Child(i), childResults[i], CreateChildResultArray(node.Child(i).ChildCount())));
                                }
                                break;
                            }
 
                        // Specialized loops
 
                        case RegexNodeKind.Oneloop:
                        case RegexNodeKind.Onelazy:
                        case RegexNodeKind.Notoneloop:
                        case RegexNodeKind.Notonelazy:
                            {
                                // Create a BDD that represents the character, then create a loop around it.
                                BDD bdd = _builder._charSetSolver.CreateBDDFromChar(node.Ch);
                                if (node.IsNotoneFamily)
                                {
                                    bdd = _builder._solver.Not(bdd);
                                }
                                result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(bdd), node.Kind is RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy, node.M, node.N));
                                break;
                            }
 
                        case RegexNodeKind.Setloop:
                        case RegexNodeKind.Setlazy:
                            {
                                // Create a BDD that represents the set string, then create a loop around it.
                                string? set = node.Str;
                                Debug.Assert(set is not null);
                                BDD setBdd = CreateBDDFromSetString(set);
                                result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(setBdd), node.Kind == RegexNodeKind.Setlazy, node.M, node.N));
                                break;
                            }
 
                        case RegexNodeKind.Empty:
                        case RegexNodeKind.UpdateBumpalong: // UpdateBumpalong is a directive relevant only to backtracking and can be ignored just like Empty
                            break;
 
                        case RegexNodeKind.Nothing:
                            result.AddLast(_builder._nothing);
                            break;
 
                        // Anchors
 
                        case RegexNodeKind.Beginning:
                            result.AddLast(_builder.BeginningAnchor);
                            break;
 
                        case RegexNodeKind.Bol:
                            EnsureNewlinePredicateInitialized();
                            result.AddLast(_builder.BolAnchor);
                            break;
 
                        case RegexNodeKind.End:  // \z anchor
                            result.AddLast(_builder.EndAnchor);
                            break;
 
                        case RegexNodeKind.EndZ: // \Z anchor
                            EnsureNewlinePredicateInitialized();
                            result.AddLast(_builder.EndAnchorZ);
                            break;
 
                        case RegexNodeKind.Eol:
                            EnsureNewlinePredicateInitialized();
                            result.AddLast(_builder.EolAnchor);
                            break;
 
                        case RegexNodeKind.Boundary:
                            EnsureWordLetterPredicateInitialized();
                            result.AddLast(_builder.BoundaryAnchor);
                            break;
 
                        case RegexNodeKind.NonBoundary:
                            EnsureWordLetterPredicateInitialized();
                            result.AddLast(_builder.NonBoundaryAnchor);
                            break;
 
                        // Unsupported
 
                        default:
                            throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, node.Kind switch
                            {
                                RegexNodeKind.Atomic or RegexNodeKind.Setloopatomic or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic => SR.ExpressionDescription_AtomicSubexpressions,
                                RegexNodeKind.Backreference => SR.ExpressionDescription_Backreference,
                                RegexNodeKind.BackreferenceConditional => SR.ExpressionDescription_Conditional,
                                RegexNodeKind.Capture => SR.ExpressionDescription_BalancingGroup,
                                RegexNodeKind.ExpressionConditional => SR.ExpressionDescription_IfThenElse,
                                RegexNodeKind.NegativeLookaround => SR.ExpressionDescription_NegativeLookaround,
                                RegexNodeKind.PositiveLookaround => SR.ExpressionDescription_PositiveLookaround,
                                RegexNodeKind.Start => SR.ExpressionDescription_ContiguousMatches,
                                _ => UnexpectedNodeType(node)
                            }));
 
                            static string UnexpectedNodeType(RegexNode node)
                            {
                                // The default should never arise, since other node types are either supported
                                // or have been removed (e.g. Group) from the final parse tree.
                                string description = $"Unexpected ({nameof(RegexNodeKind)}: {node.Kind})";
                                Debug.Fail(description);
                                return description;
                            }
                    }
                }
                else
                {
                    // At this point, all the child nodes have been converted into the childResults array.
                    Debug.Assert(node.ChildCount() > 0);
                    Debug.Assert(childResults is not null);
                    Debug.Assert(childResults.Length == node.ChildCount());
                    Debug.Assert(result.Count == 0);
 
                    switch (node.Kind)
                    {
                        case RegexNodeKind.Concatenate:
                            {
                                // Flatten the child results into the top-level result.
                                foreach (DoublyLinkedList<SymbolicRegexNode<BDD>> childResult in childResults)
                                {
                                    result.AddLast(childResult);
                                }
                                break;
                            }
 
                        case RegexNodeKind.Alternate:
                            {
                                // Alternations in SymbolicRegexNode are binary and always normalized to right associative
                                // form, so here the list of children is built into a tree of alternations.
                                // The order is kept to achieve the same semantics as the backtracking engines.
                                SymbolicRegexNode<BDD> or = _builder._nothing;
 
                                // Enumerate in reverse order through the child results
                                for (int i = childResults.Length - 1; i >= 0; i--)
                                {
                                    DoublyLinkedList<SymbolicRegexNode<BDD>> childResult = childResults[i];
 
                                    // If childResult is a non-singleton list, then it denotes a concatenation that must be constructed at this point.
                                    SymbolicRegexNode<BDD> elem = childResult.Count == 1 ?
                                        childResult.FirstElement :
                                        _builder.CreateConcatAlreadyReversed(childResult);
                                    if (elem.IsNothing(_builder._solver))
                                    {
                                        continue;
                                    }
 
                                    or = elem.IsAnyStar(_builder._solver) ?
                                        elem : // .* is the absorbing element
                                        SymbolicRegexNode<BDD>.CreateAlternate(_builder, elem, or);
                                }
                                result.AddLast(or);
                                break;
                            }
 
                        case RegexNodeKind.Loop:
                        case RegexNodeKind.Lazyloop:
                            {
                                Debug.Assert(childResults.Length == 1);
                                DoublyLinkedList<SymbolicRegexNode<BDD>> childResult = childResults[0];
 
                                // Convert a list of nodes into a concatenation
                                SymbolicRegexNode<BDD> body = childResult.Count == 1 ?
                                    childResult.FirstElement :
                                    _builder.CreateConcatAlreadyReversed(childResult);
                                result.AddLast(_builder.CreateLoop(body, node.Kind == RegexNodeKind.Lazyloop, node.M, node.N));
                                break;
                            }
 
                        default:
                            {
                                // No other nodes besides captures can have been pushed twice at this point.
                                Debug.Assert(node.Kind == RegexNodeKind.Capture && node.N == -1);
 
                                Debug.Assert(childResults.Length == 1);
                                DoublyLinkedList<SymbolicRegexNode<BDD>> childResult = childResults[0];
 
                                int captureNum = RegexParser.MapCaptureNumber(node.M, _captureSparseMapping);
 
                                // Add capture start/end markers
                                childResult.AddFirst(_builder.CreateCaptureStart(captureNum));
                                childResult.AddLast(_builder.CreateCaptureEnd(captureNum));
                                result.AddLast(childResult);
                                break;
                            }
                    }
                }
            }
 
            // Only a top-level concatenation or capture node can result in a non-singleton list.
            Debug.Assert(rootResult.Count == 1 || root.Kind == RegexNodeKind.Concatenate || root.Kind == RegexNodeKind.Capture);
 
            return rootResult.Count == 1 ?
                rootResult.FirstElement :
                _builder.CreateConcatAlreadyReversed(rootResult);
 
            void EnsureNewlinePredicateInitialized()
            {
                // Initialize the \n set in the builder if it has not been updated already
                if (_builder._newLineSet.Equals(_builder._solver.Empty))
                {
                    _builder._newLineSet = _builder._charSetSolver.CreateBDDFromChar('\n');
                }
            }
 
            void EnsureWordLetterPredicateInitialized()
            {
                // Initialize the word letter set based on the Unicode definition of it if it was not updated already
                if (_builder._wordLetterForBoundariesSet.Equals(_builder._solver.Empty))
                {
                    // Use the set including joiner and non-joiner
                    _builder._wordLetterForBoundariesSet = UnicodeCategoryConditions.WordLetterForAnchors((CharSetSolver)_builder._solver);
                }
            }
 
            SymbolicRegexNode<BDD> ConvertSet(RegexNode node)
            {
                Debug.Assert(node.Kind == RegexNodeKind.Set);
 
                string? set = node.Str;
                Debug.Assert(set is not null);
 
                return _builder.CreateSingleton(CreateBDDFromSetString(set));
            }
 
            DoublyLinkedList<SymbolicRegexNode<BDD>>[]? CreateChildResultArray(int k) => k == 0 ? null : new DoublyLinkedList<SymbolicRegexNode<BDD>>[k];
        }
 
        /// <summary>Creates a BDD from the <see cref="RegexCharClass"/> set string to determine whether a char is in the set.</summary>
        /// <param name="set">The RegexCharClass set string.</param>
        /// <returns>A BDD that, when queried with a char, answers whether that char is in the specified set.</returns>
        private BDD CreateBDDFromSetString(string set)
        {
            // If we're too deep on the stack, continue any recursion on another thread.
            if (!StackHelper.TryEnsureSufficientExecutionStack())
            {
                return StackHelper.CallOnEmptyStack(CreateBDDFromSetString, set);
            }
 
            // Lazily-initialize the set cache on first use, since some expressions may not have character classes in them.
            _setBddCache ??= new Dictionary<string, BDD>();
 
            // Try to get the cached BDD for the set key.
            // If one doesn't yet exist, compute and populate it.
            ref BDD? result = ref CollectionsMarshal.GetValueRefOrAddDefault(_setBddCache, set, out _);
            return result ??= Compute(set);
 
            // <summary>Parses the RegexCharClass set string and creates a BDD that represents the same condition.</summary>
            BDD Compute(string set)
            {
                List<BDD> conditions = new();
                var charSetSolver = (CharSetSolver)_builder._solver;
 
                // The set string is composed of four parts: flags (which today are just for negation), ranges (a list
                // of pairs of values representing the ranges a character that matches the set could fall in (or if it's
                // negated, fall out of), categories (a list of codes based on UnicodeCategory values), and then optionally
                // another entire set string that's subtracted from the outer set string.  We parse each of those pieces
                // to build up a BDD that will return true if a char matches the set string, and otherwise false. This
                // BDD then is functionally equivalent to RegexCharClass.CharInClass.
 
                bool negate = RegexCharClass.IsNegated(set);
 
                // Handle ranges
                // A BDD is created for each range, and is then negated if the set is negated.  All of the BDDs for
                // all of the ranges are stored in a set of these "conditions", which will later have all of the BDDs
                // and'd (conjunction) together if the set is negated, or or'd (disjunction) together if not negated.
                List<(char First, char Last)>? ranges = RegexCharClass.ComputeRanges(set);
                if (ranges is not null)
                {
                    foreach ((char first, char last) in ranges)
                    {
                        BDD bdd = charSetSolver.CreateBDDFromRange(first, last);
                        if (negate)
                        {
                            bdd = charSetSolver.Not(bdd);
                        }
                        conditions.Add(bdd);
                    }
                }
 
                // Handle categories
 
                const int UnicodeCategoryCount = 1 + (int)UnicodeCategory.OtherNotAssigned;
                Debug.Assert(!Enum.IsDefined((UnicodeCategory)UnicodeCategoryCount));
                Span<bool> categoryCodes = stackalloc bool[UnicodeCategoryCount];
 
                int setLength = set[RegexCharClass.SetLengthIndex];
                int catLength = set[RegexCharClass.CategoryLengthIndex];
                int catStart = setLength + RegexCharClass.SetStartIndex;
                int i = catStart;
                while (i < catStart + catLength)
                {
                    // TODO: This logic should really be part of RegexCharClass, as it's parsing a set
                    // string and ideally such logic would be internal to RegexCharClass.  It could expose
                    // a helper that returns the set of specified UnicodeCategory's and whether they're negated.
 
                    // Singleton categories are stored as values whose code is 1 + the UnicodeCategory value.
                    // Thus -1 is applied to extract the actual code of the category. The category itself may be
                    // negated, e.g. \D instead of \d.
                    short categoryCode = (short)set[i++];
                    if (categoryCode != 0)
                    {
                        // Create a BDD for the UnicodeCategory.  If the category code is negative, the category
                        // is negated, but the whole set string may also be negated, so we need to negate the condition
                        // if one and only one of these negations occurs.
                        BDD cond = MapCategoryCodeToCondition((UnicodeCategory)(Math.Abs(categoryCode) - 1));
                        if ((categoryCode < 0) ^ negate)
                        {
                            cond = charSetSolver.Not(cond);
                        }
                        conditions.Add(cond);
                        continue;
                    }
 
                    // Special case for a whole group G of categories surrounded by 0's.
                    // Essentially 0 C1 C2 ... Cn 0 ==> G = (C1 | C2 | ... | Cn)
                    categoryCode = (short)set[i++];
                    if (categoryCode == 0)
                    {
                        continue; // empty set of categories
                    }
 
                    // If the first catCode is negated, the group as a whole is negated
                    bool negatedGroup = categoryCode < 0;
 
                    // Collect individual category codes. At this point, the only codes that can be
                    // here are valid UnicodeCategories; SpaceConst is never used as part of groups,
                    // as such groups are only constructed in our own RegexCharClass consts, and it's
                    // not used there.
                    categoryCodes.Clear();
                    while (categoryCode != 0)
                    {
                        int cat = Math.Abs((int)categoryCode) - 1;
                        Debug.Assert(cat >= 0 && cat < categoryCodes.Length, $"Expected {cat} to be in range [0, {categoryCodes.Length}).");
                        categoryCodes[cat] = true;
                        categoryCode = (short)set[i++];
                    }
 
                    // Create a BDD that represents all of the categories or'd (disjunction) together (C1 | C2 | ... | Cn),
                    // then negate the result if necessary (noting that two negations cancel each other out... if the set
                    // is negated but then the group itself is also negated).  And add the resulting BDD to our set of conditions.
                    BDD bdd = MapCategoryCodeSetToCondition(categoryCodes);
                    if (negate ^ negatedGroup)
                    {
                        bdd = charSetSolver.Not(bdd);
                    }
                    conditions.Add(bdd);
                }
 
                // Handle subtraction
                BDD? subtractorCond = null;
                if (set.Length > i)
                {
                    // The set has a subtractor-set at the end.
                    // All characters in the subtractor-set are excluded from the set.
                    // Note that the subtractor sets may be nested, e.g. in r=[a-z-[b-g-[cd]]]
                    // the subtractor set [b-g-[cd]] has itself a subtractor set [cd].
                    // Thus r is the set of characters between a..z except b,e,f,g
                    subtractorCond = CreateBDDFromSetString(set.Substring(i));
                }
 
                // If there are no ranges and no groups then there are no conditions.
                // This situation arises in particular for RegexOptions.SingleLine with a . (dot),
                // which translates into a set string that accepts everything.
                BDD result = conditions.Count == 0 ?
                    (negate ? charSetSolver.Empty : charSetSolver.Full) :
                    (negate ? charSetSolver.And(CollectionsMarshal.AsSpan(conditions)) : charSetSolver.Or(CollectionsMarshal.AsSpan(conditions)));
 
                // Now apply the subtracted condition if there is one.  As a subtly of Regex semantics,
                // the subtractor is not within the scope of the negation (if there is any negation).
                // Thus we subtract after applying any negation above rather than before.  Subtraction
                // is achieved by negating the subtraction (such that the result of the negation represents
                // things still to be accepted after subtraction) and then and'ing it with the result, effectively
                // masking off anything matched by the subtraction set.
                if (subtractorCond is not null)
                {
                    result = charSetSolver.And(result, charSetSolver.Not(subtractorCond));
                }
 
                return result;
 
                // <summary>Creates a BDD that matches when a character is part of any of the specified UnicodeCategory values.</summary>
                BDD MapCategoryCodeSetToCondition(Span<bool> catCodes)
                {
                    // \w is so common, to help speed up construction we special-case it by using
                    // the combined \w set rather than an or (disjunction) of the component categories.
                    // This is done by validating that all of the categories for \w are there, and then removing
                    // them all if they are and instead starting our BDD off as \w.
                    BDD? result = null;
                    if (catCodes[(int)UnicodeCategory.UppercaseLetter] &&
                        catCodes[(int)UnicodeCategory.LowercaseLetter] &&
                        catCodes[(int)UnicodeCategory.TitlecaseLetter] &&
                        catCodes[(int)UnicodeCategory.ModifierLetter] &&
                        catCodes[(int)UnicodeCategory.OtherLetter] &&
                        catCodes[(int)UnicodeCategory.NonSpacingMark] &&
                        catCodes[(int)UnicodeCategory.DecimalDigitNumber] &&
                        catCodes[(int)UnicodeCategory.ConnectorPunctuation])
                    {
                        catCodes[(int)UnicodeCategory.UppercaseLetter] =
                        catCodes[(int)UnicodeCategory.LowercaseLetter] =
                        catCodes[(int)UnicodeCategory.TitlecaseLetter] =
                        catCodes[(int)UnicodeCategory.ModifierLetter] =
                        catCodes[(int)UnicodeCategory.OtherLetter] =
                        catCodes[(int)UnicodeCategory.NonSpacingMark] =
                        catCodes[(int)UnicodeCategory.DecimalDigitNumber] =
                        catCodes[(int)UnicodeCategory.ConnectorPunctuation] = false;
 
                        result = UnicodeCategoryConditions.WordLetter(charSetSolver);
                    }
 
                    // For any remaining categories, create a condition for each and
                    // or that into the resulting BDD.
                    for (int i = 0; i < catCodes.Length; i++)
                    {
                        if (catCodes[i])
                        {
                            BDD cond = MapCategoryCodeToCondition((UnicodeCategory)i);
                            result = result is null ? cond : charSetSolver.Or(result, cond);
                        }
                    }
 
                    Debug.Assert(result is not null);
                    return result;
                }
 
                // Gets the BDD for evaluating whether a character is part of the specified category.
                BDD MapCategoryCodeToCondition(UnicodeCategory code)
                {
                    Debug.Assert(Enum.IsDefined(code) || code == (UnicodeCategory)(RegexCharClass.SpaceConst - 1), $"Unknown category: {code}");
                    return code == (UnicodeCategory)(RegexCharClass.SpaceConst - 1) ?
                        UnicodeCategoryConditions.WhiteSpace :
                        UnicodeCategoryConditions.GetCategory(code);
                }
            }
        }
    }
}
File: System\Text\RegularExpressions\Symbolic\RegexNodeConverter.cs	Web Access
Project: src\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)