RegexTreeAnalyzer.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Collections.Generic;
using System.Threading;
 
namespace System.Text.RegularExpressions
{
    /// <summary>Analyzes a <see cref="RegexTree"/> of <see cref="RegexNode"/>s to produce data on the tree structure, in particular in support of code generation.</summary>
    internal static class RegexTreeAnalyzer
    {
        /// <summary>Analyzes a <see cref="RegexInterpreterCode"/> to learn about the structure of the tree.</summary>
        public static AnalysisResults Analyze(RegexTree regexTree)
        {
            var results = new AnalysisResults(regexTree);
            results._complete = TryAnalyze(regexTree.Root, results, isAtomicByAncestor: true, isInLoop: false);
            return results;
 
            static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor, bool isInLoop)
            {
                if (!StackHelper.TryEnsureSufficientExecutionStack())
                {
                    return false;
                }
 
                // Track whether we've seen any nodes with various options set.
                results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0;
                results._hasRightToLeft |= (node.Options & RegexOptions.RightToLeft) != 0;
 
                // Track whether this node is inside of a loop.
                if (isInLoop)
                {
                    (results._inLoops ??= new HashSet<RegexNode>()).Add(node);
                }
 
                if (isAtomicByAncestor)
                {
                    // We've been told by our parent that we should be considered atomic, so add ourselves
                    // to the atomic collection.
                    results._isAtomicByAncestor.Add(node);
                }
                else
                {
                    // Certain kinds of nodes incur backtracking logic themselves: add them to the backtracking collection.
                    // We may later find that a node contains another that has backtracking; we'll add nodes based on that
                    // after examining the children.
                    switch (node.Kind)
                    {
                        case RegexNodeKind.Alternate:
                        case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M != node.N:
                        case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when node.M != node.N:
                            (results._mayBacktrack ??= new HashSet<RegexNode>()).Add(node);
                            break;
                    }
                }
 
                // Update state for certain node types.
                bool isAtomicBySelf = false;
                switch (node.Kind)
                {
                    // Some node types add atomicity around what they wrap.  Set isAtomicBySelfOrParent to true for such nodes
                    // even if it was false upon entering the method.
                    case RegexNodeKind.Atomic:
                    case RegexNodeKind.NegativeLookaround:
                    case RegexNodeKind.PositiveLookaround:
                        isAtomicBySelf = true;
                        break;
 
                    // Track any nodes that are themselves captures.
                    case RegexNodeKind.Capture:
                        results._containsCapture.Add(node);
                        break;
 
                    // Track whether we've recurred into a loop
                    case RegexNodeKind.Loop:
                    case RegexNodeKind.Lazyloop:
                        isInLoop = true;
                        break;
                }
 
                // Process each child.
                int childCount = node.ChildCount();
                for (int i = 0; i < childCount; i++)
                {
                    RegexNode child = node.Child(i);
 
                    // Determine whether the child should be treated as atomic (whether anything
                    // can backtrack into it), which is influenced by whether this node (the child's
                    // parent) is considered atomic by itself or by its parent.
                    bool treatChildAsAtomic = (isAtomicByAncestor | isAtomicBySelf) && node.Kind switch
                    {
                        // If the parent is atomic, so is the child.  That's the whole purpose
                        // of the Atomic node, and lookarounds are also implicitly atomic.
                        RegexNodeKind.Atomic or RegexNodeKind.NegativeLookaround or RegexNodeKind.PositiveLookaround => true,
 
                        // Each branch is considered independently, so any atomicity applied to the alternation also applies
                        // to each individual branch.  This is true as well for conditionals.
                        RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional => true,
 
                        // Captures don't impact atomicity: if the parent of a capture is atomic, the capture is also atomic.
                        RegexNodeKind.Capture => true,
 
                        // If the parent is a concatenation and this is the last node, any atomicity
                        // applying to the concatenation applies to this node, too.
                        RegexNodeKind.Concatenate => i == childCount - 1,
 
                        // For loops with a max iteration count of 1, they themselves can be considered
                        // atomic as can whatever they wrap, as they won't ever iterate more than once
                        // and thus we don't need to worry about one iteration consuming input destined
                        // for a subsequent iteration.
                        RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.N == 1 => true,
 
                        // For any other parent type, give up on trying to prove atomicity.
                        _ => false,
                    };
 
                    // Now analyze the child.
                    if (!TryAnalyze(child, results, treatChildAsAtomic, isInLoop))
                    {
                        return false;
                    }
 
                    // If the child contains captures, so too does this parent.
                    if (results._containsCapture.Contains(child))
                    {
                        results._containsCapture.Add(node);
                    }
 
                    // If the child might require backtracking into it, so too might the parent,
                    // unless the parent is itself considered atomic.  Here we don't consider parental
                    // atomicity, as we need to surface upwards to the parent whether any backtracking
                    // will be visible from this node to it.
                    if (!isAtomicBySelf && (results._mayBacktrack?.Contains(child) == true))
                    {
                        (results._mayBacktrack ??= new HashSet<RegexNode>()).Add(node);
                    }
                }
 
                // Successfully analyzed the node.
                return true;
            }
        }
    }
 
    /// <summary>Provides results of a tree analysis.</summary>
    internal sealed class AnalysisResults
    {
        /// <summary>Indicates whether the whole tree was successfully processed.</summary>
        /// <remarks>
        /// If it wasn't successfully processed, we have to assume the "worst", e.g. if it's
        /// false, we need to assume we didn't fully determine which nodes contain captures,
        /// and thus we need to assume they all do and not discard logic necessary to support
        /// captures.  It should be exceedingly rare that this is false.
        /// </remarks>
        internal bool _complete;
 
        /// <summary>Set of nodes that are considered to be atomic based on themselves or their ancestry.</summary>
        internal readonly HashSet<RegexNode> _isAtomicByAncestor = new(); // since the root is implicitly atomic, every tree will contain atomic-by-ancestor nodes
        /// <summary>Set of nodes that directly or indirectly contain capture groups.</summary>
        internal readonly HashSet<RegexNode> _containsCapture = new(); // the root is a capture, so this will always contain at least the root node
        /// <summary>Set of nodes that directly or indirectly contain backtracking constructs that aren't hidden internally by atomic constructs.</summary>
        internal HashSet<RegexNode>? _mayBacktrack;
        /// <summary>Set of nodes contained inside loops.</summary>
        internal HashSet<RegexNode>? _inLoops;
        /// <summary>Whether any node has <see cref="RegexOptions.IgnoreCase"/> set.</summary>
        internal bool _hasIgnoreCase;
        /// <summary>Whether any node has <see cref="RegexOptions.RightToLeft"/> set.</summary>
        internal bool _hasRightToLeft;
 
        /// <summary>Initializes the instance.</summary>
        /// <param name="regexTree">The code being analyzed.</param>
        internal AnalysisResults(RegexTree regexTree) => RegexTree = regexTree;
 
        /// <summary>Gets the code that was analyzed.</summary>
        public RegexTree RegexTree { get; }
 
        /// <summary>Gets whether a node is considered atomic based on its ancestry.</summary>
        /// <remarks>
        /// If the whole tree couldn't be examined, this returns false.  That could lead to additional
        /// code being output as nodes that could have been made atomic aren't, but functionally it's
        /// the safe choice.
        /// </remarks>
        public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node);
 
        /// <summary>Gets whether a node directly or indirectly contains captures.</summary>
        /// <remarks>
        /// If the whole tree couldn't be examined, this returns true.  That could lead to additional
        /// code being emitted to deal with captures that can't occur, but functionally it's the
        /// safe choice.
        /// </remarks>
        public bool MayContainCapture(RegexNode node) => !_complete || _containsCapture.Contains(node);
 
        /// <summary>Gets whether a node is or directory or indirectly contains a backtracking construct that isn't hidden by an internal atomic construct.</summary>
        /// <remarks>
        /// In most code generation situations, we only need to know after we emit the child code whether
        /// the child may backtrack, and that we can see with 100% certainty.  This method is useful in situations
        /// where we need to predict without traversing the child at code generation time whether it may
        /// incur backtracking.  This method may have (few) false positives (return true when it could have
        /// returned false), but won't have any false negatives (return false when it should have returned true),
        /// meaning it might claim a node requires backtracking even if it doesn't, but it will always return
        /// true for any node that requires backtracking. In that vein, if the whole tree couldn't be examined,
        /// this returns true.
        /// </remarks>
        public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false);
 
        /// <summary>Gets whether a node may be contained inside of one or more loops.</summary>
        /// <remarks>
        /// Constructs sometimes need to maintain state about their execution such that if they're backtracked
        /// to, they have the necessary context to make a different choice and continue execution.  Such state
        /// can often be maintained in locals dedicated to that construct.  If, however, the construct is inside
        /// of a loop, an additional iteration of that outerloop could invoke the inner construct and cause those
        /// locals to have their state overwritten.  In such situations, the constructs can't rely on the locals
        /// maintaining the state and instead need to push the state on to a stack.  That pushing/popping has
        /// additional cost, however, both in terms of run-time overheads and in terms of the additional code
        /// required to handle it.  Code generators then can consult this <see cref="IsInLoop"/> to determine
        /// whether locals are sufficient to maintain the state or whether the state needs to be pushed on to a stack.
        ///
        /// Loops are not considered to be "in" themselves.  This will only return true for a loop node if it's
        /// nested inside of another loop node.
        ///
        /// If the whole tree couldn't be examined, this returns true.  That could lead to additional
        /// backtracking-related code being emitted, but it's functionally the safe choice.
        /// </remarks>
        public bool IsInLoop(RegexNode node) => !_complete || (_inLoops?.Contains(node) ?? false);
 
        /// <summary>Gets whether a node might have <see cref="RegexOptions.IgnoreCase"/> set.</summary>
        /// <remarks>
        /// If the whole tree couldn't be examined, this returns true.  That could lead to additional
        /// code being emitted to support case-insensitivity in expressions that don't actually need
        /// it, but functionally it's the safe choice.
        /// </remarks>
        public bool HasIgnoreCase => !_complete || _hasIgnoreCase;
 
        /// <summary>Gets whether a node might have <see cref="RegexOptions.RightToLeft"/> set.</summary>
        /// <remarks>
        /// If the whole tree couldn't be examined, this returns true.  That could lead to additional
        /// code being emitted to support expressions that don't actually contain any RightToLeft
        /// nodes, but functionally it's the safe choice.
        /// </remarks>
        public bool HasRightToLeft => !_complete || _hasRightToLeft;
    }
}
File: System\Text\RegularExpressions\RegexTreeAnalyzer.cs	Web Access
Project: src\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)