|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Threading;
namespace System.Text.RegularExpressions
{
/// <summary>Represents a regex subexpression.</summary>
internal sealed class RegexNode
{
/// <summary>Arbitrary number of repetitions of the same character when we'd prefer to represent that as a repeater of that character rather than a string.</summary>
internal const int MultiVsRepeaterLimit = 64;
/// <summary>The node's children.</summary>
/// <remarks>null if no children, a <see cref="RegexNode"/> if one child, or a <see cref="List{RegexNode}"/> if multiple children.</remarks>
private object? Children;
/// <summary>The kind of expression represented by this node.</summary>
public RegexNodeKind Kind { get; private set; }
/// <summary>A string associated with the node.</summary>
/// <remarks>For a <see cref="RegexNodeKind.Multi"/>, this is the string from the expression. For an <see cref="IsSetFamily"/> node, this is the character class string from <see cref="RegexCharClass"/>.</remarks>
public string? Str { get; private set; }
/// <summary>The character associated with the node.</summary>
/// <remarks>For a <see cref="IsOneFamily"/> or <see cref="IsNotoneFamily"/> node, the character from the expression.</remarks>
public char Ch { get; private set; }
/// <summary>The minimum number of iterations for a loop, or the capture group number for a capture or backreference.</summary>
/// <remarks>No minimum is represented by 0. No capture group is represented by -1.</remarks>
public int M { get; private set; }
/// <summary>The maximum number of iterations for a loop, or the uncapture group number for a balancing group.</summary>
/// <remarks>No upper bound is represented by <see cref="int.MaxValue"/>. No capture group is represented by -1.</remarks>
public int N { get; private set; }
/// <summary>The options associated with the node.</summary>
public RegexOptions Options;
/// <summary>The node's parent node in the tree.</summary>
/// <remarks>
/// During parsing, top-level nodes are also stacked onto a parse stack (a stack of trees) using <see cref="Parent"/>.
/// After parsing, <see cref="Parent"/> is the node in the tree that has this node as or in <see cref="Children"/>.
/// </remarks>
public RegexNode? Parent;
public RegexNode(RegexNodeKind kind, RegexOptions options)
{
Kind = kind;
Options = options;
}
public RegexNode(RegexNodeKind kind, RegexOptions options, char ch)
{
Kind = kind;
Options = options;
Ch = ch;
}
public RegexNode(RegexNodeKind kind, RegexOptions options, string str)
{
Kind = kind;
Options = options;
Str = str;
}
public RegexNode(RegexNodeKind kind, RegexOptions options, int m)
{
Kind = kind;
Options = options;
M = m;
}
public RegexNode(RegexNodeKind kind, RegexOptions options, int m, int n)
{
Kind = kind;
Options = options;
M = m;
N = n;
}
/// <summary>Creates a new node from an existing one/notone/setone {lazy/atomic} loop with one less iteration.</summary>
public RegexNode CloneCharLoopWithOneLessIteration()
{
Debug.Assert(Kind is RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or
RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or
RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic);
Debug.Assert(M > 0);
RegexNode newNode = IsSetFamily ?
new RegexNode(Kind, Options, Str!) :
new RegexNode(Kind, Options, Ch);
newNode.M = M - 1;
newNode.N = N == int.MaxValue ? int.MaxValue : N - 1;
return newNode;
}
/// <summary>Creates a RegexNode representing a single character.</summary>
/// <param name="ch">The character.</param>
/// <param name="options">The node's options.</param>
/// <param name="culture">The culture to use to perform any required transformations.</param>
/// <param name="caseBehavior">The behavior to be used for case comparisons. If the value hasn't been set yet, it will get initialized in the first lookup.</param>
/// <returns>The created RegexNode. This might be a RegexNode.One or a RegexNode.Set.</returns>
public static RegexNode CreateOneWithCaseConversion(char ch, RegexOptions options, CultureInfo? culture, ref RegexCaseBehavior caseBehavior)
{
// If the options specify case-insensitivity, we try to create a node that fully encapsulates that.
if ((options & RegexOptions.IgnoreCase) != 0)
{
Debug.Assert(culture is not null);
if (!RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(ch, culture, ref caseBehavior, out ReadOnlySpan<char> equivalences))
{
// If we reach here, then we know that ch does not participate in case conversion, so we just
// create a One node with it and strip out the IgnoreCase option.
return new RegexNode(RegexNodeKind.One, options & ~RegexOptions.IgnoreCase, ch);
}
// If it does participate in case conversion, then transform the Node into a set with
// all possible valid values and remove the IgnoreCase option to make the node case-sensitive.
string stringSet = RegexCharClass.CharsToStringClass(equivalences);
return new RegexNode(RegexNodeKind.Set, options & ~RegexOptions.IgnoreCase, stringSet);
}
// Create a One node for the character.
return new RegexNode(RegexNodeKind.One, options, ch);
}
/// <summary>Reverses all children of a concatenation when in RightToLeft mode.</summary>
public RegexNode ReverseConcatenationIfRightToLeft()
{
if ((Options & RegexOptions.RightToLeft) != 0 &&
Kind == RegexNodeKind.Concatenate &&
ChildCount() > 1)
{
((List<RegexNode>)Children!).Reverse();
}
return this;
}
/// <summary>
/// Pass type as OneLazy or OneLoop
/// </summary>
private void MakeRep(RegexNodeKind kind, int min, int max)
{
Kind += kind - RegexNodeKind.One;
M = min;
N = max;
}
private void MakeLoopAtomic()
{
switch (Kind)
{
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop:
// For loops, we simply change the Type to the atomic variant.
// Atomic greedy loops should consume as many values as they can.
Kind += RegexNodeKind.Oneloopatomic - RegexNodeKind.Oneloop;
break;
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy:
// For lazy, we not only change the Type, we also lower the max number of iterations
// to the minimum number of iterations, creating a repeater, as they should end up
// matching as little as possible.
Kind += RegexNodeKind.Oneloopatomic - RegexNodeKind.Onelazy;
N = M;
if (N == 0)
{
// If moving the max to be the same as the min dropped it to 0, there's no
// work to be done for this node, and we can make it Empty.
Kind = RegexNodeKind.Empty;
Str = null;
Ch = '\0';
}
else if (Kind == RegexNodeKind.Oneloopatomic && N is >= 2 and <= MultiVsRepeaterLimit)
{
// If this is now a One repeater with a small enough length,
// make it a Multi instead, as they're better optimized down the line.
Kind = RegexNodeKind.Multi;
Str = new string(Ch, N);
Ch = '\0';
M = N = 0;
}
break;
default:
Debug.Fail($"Unexpected type: {Kind}");
break;
}
}
#if DEBUG
/// <summary>Validate invariants the rest of the implementation relies on for processing fully-built trees.</summary>
[Conditional("DEBUG")]
private void ValidateFinalTreeInvariants()
{
Debug.Assert(Kind == RegexNodeKind.Capture, "Every generated tree should begin with a capture node");
var toExamine = new Stack<RegexNode>();
toExamine.Push(this);
while (toExamine.Count > 0)
{
RegexNode node = toExamine.Pop();
// Add all children to be examined
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
RegexNode child = node.Child(i);
Debug.Assert(child.Parent == node, $"{child.Describe()} missing reference to parent {node.Describe()}");
toExamine.Push(child);
}
// Validate that we never see certain node types.
Debug.Assert(Kind != RegexNodeKind.Group, "All Group nodes should have been removed.");
// Validate node types and expected child counts.
switch (node.Kind)
{
case RegexNodeKind.Group:
Debug.Fail("All Group nodes should have been removed.");
break;
case RegexNodeKind.Beginning:
case RegexNodeKind.Bol:
case RegexNodeKind.Boundary:
case RegexNodeKind.ECMABoundary:
case RegexNodeKind.Empty:
case RegexNodeKind.End:
case RegexNodeKind.EndZ:
case RegexNodeKind.Eol:
case RegexNodeKind.Multi:
case RegexNodeKind.NonBoundary:
case RegexNodeKind.NonECMABoundary:
case RegexNodeKind.Nothing:
case RegexNodeKind.Notone:
case RegexNodeKind.Notonelazy:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.One:
case RegexNodeKind.Onelazy:
case RegexNodeKind.Oneloop:
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Backreference:
case RegexNodeKind.Set:
case RegexNodeKind.Setlazy:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
case RegexNodeKind.Start:
case RegexNodeKind.UpdateBumpalong:
Debug.Assert(childCount == 0, $"Expected zero children for {node.Kind}, got {childCount}.");
break;
case RegexNodeKind.Atomic:
case RegexNodeKind.Capture:
case RegexNodeKind.Lazyloop:
case RegexNodeKind.Loop:
case RegexNodeKind.NegativeLookaround:
case RegexNodeKind.PositiveLookaround:
Debug.Assert(childCount == 1, $"Expected one and only one child for {node.Kind}, got {childCount}.");
break;
case RegexNodeKind.BackreferenceConditional:
Debug.Assert(childCount == 2, $"Expected two children for {node.Kind}, got {childCount}");
break;
case RegexNodeKind.ExpressionConditional:
Debug.Assert(childCount == 3, $"Expected three children for {node.Kind}, got {childCount}");
break;
case RegexNodeKind.Concatenate:
case RegexNodeKind.Alternate:
Debug.Assert(childCount >= 2, $"Expected at least two children for {node.Kind}, got {childCount}.");
break;
default:
Debug.Fail($"Unexpected node type: {node.Kind}");
break;
}
// Validate node configuration.
switch (node.Kind)
{
case RegexNodeKind.Multi:
Debug.Assert(node.Str is not null, "Expect non-null multi string");
Debug.Assert(node.Str.Length >= 2, $"Expected {node.Str} to be at least two characters");
break;
case RegexNodeKind.Set:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
case RegexNodeKind.Setlazy:
Debug.Assert(!string.IsNullOrEmpty(node.Str), $"Expected non-null, non-empty string for {node.Kind}.");
break;
default:
Debug.Assert(node.Str is null, $"Expected null string for {node.Kind}, got \"{node.Str}\".");
break;
}
// Validate only Backreference nodes have IgnoreCase Option
switch (node.Kind)
{
case RegexNodeKind.Backreference:
break;
default:
Debug.Assert((node.Options & RegexOptions.IgnoreCase) == 0, $"{node.Kind} node should not have RegexOptions.IgnoreCase");
break;
}
}
}
#endif
/// <summary>Performs additional optimizations on an entire tree prior to being used.</summary>
/// <remarks>
/// Some optimizations are performed by the parser while parsing, and others are performed
/// as nodes are being added to the tree. The optimizations here expect the tree to be fully
/// formed, as they inspect relationships between nodes that may not have been in place as
/// individual nodes were being processed/added to the tree.
/// </remarks>
internal RegexNode FinalOptimize()
{
RegexNode rootNode = this;
Debug.Assert(rootNode.Kind == RegexNodeKind.Capture);
Debug.Assert(rootNode.Parent is null);
Debug.Assert(rootNode.ChildCount() == 1);
// Only apply optimization when LTR to avoid needing additional code for the much rarer RTL case.
// Also only apply these optimizations when not using NonBacktracking, as these optimizations are
// all about avoiding things that are impactful for the backtracking engines but nops for non-backtracking.
if ((Options & (RegexOptions.RightToLeft | RegexOptions.NonBacktracking)) == 0)
{
// Optimization: eliminate backtracking for loops.
// For any single-character loop (Oneloop, Notoneloop, Setloop), see if we can automatically convert
// that into its atomic counterpart (Oneloopatomic, Notoneloopatomic, Setloopatomic) based on what
// comes after it in the expression tree.
rootNode.FindAndMakeLoopsAtomic();
// Optimization: backtracking removal at expression end.
// If we find backtracking construct at the end of the regex, we can instead make it non-backtracking,
// since nothing would ever backtrack into it anyway. Doing this then makes the construct available
// to implementations that don't support backtracking.
rootNode.EliminateEndingBacktracking();
// Optimization: unnecessary re-processing of starting loops.
// If an expression is guaranteed to begin with a single-character unbounded loop that isn't part of an alternation (in which case it
// wouldn't be guaranteed to be at the beginning) or a capture (in which case a back reference could be influenced by its length), then we
// can update the tree with a temporary node to indicate that the implementation should use that node's ending position in the input text
// as the next starting position at which to start the next match. This avoids redoing matches we've already performed, e.g. matching
// "\w+@dot.net" against "is this a valid address@dot.net", the \w+ will initially match the "is" and then will fail to match the "@".
// Rather than bumping the scan loop by 1 and trying again to match at the "s", we can instead start at the " ". For functional correctness
// we can only consider unbounded loops, as to be able to start at the end of the loop we need the loop to have consumed all possible matches;
// otherwise, you could end up with a pattern like "a{1,3}b" matching against "aaaabc", which should match, but if we pre-emptively stop consuming
// after the first three a's and re-start from that position, we'll end up failing the match even though it should have succeeded. We can also
// apply this optimization to non-atomic loops: even though backtracking could be necessary, such backtracking would be handled within the processing
// of a single starting position. Lazy loops similarly benefit, as a failed match will result in exploring the exact same search space as with
// a greedy loop, just in the opposite order (and a successful match will overwrite the bumpalong position); we need to avoid atomic lazy loops,
// however, as they will only end up as a repeater for the minimum length and thus will effectively end up with a non-infinite upper bound, which
// we've already outlined is problematic.
{
RegexNode node = rootNode.Child(0); // skip implicit root capture node
bool atomicByAncestry = true; // the root is implicitly atomic because nothing comes after it (same for the implicit root capture)
while (true)
{
switch (node.Kind)
{
case RegexNodeKind.Atomic:
node = node.Child(0);
continue;
case RegexNodeKind.Concatenate:
atomicByAncestry = false;
node = node.Child(0);
continue;
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when node.N == int.MaxValue:
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when node.N == int.MaxValue && !atomicByAncestry:
if (node.Parent is { Kind: RegexNodeKind.Concatenate } parent)
{
parent.InsertChild(1, new RegexNode(RegexNodeKind.UpdateBumpalong, node.Options));
}
break;
}
break;
}
}
}
// Done optimizing. Return the final tree.
#if DEBUG
rootNode.ValidateFinalTreeInvariants();
#endif
return rootNode;
}
/// <summary>Converts nodes at the end of the node tree to be atomic.</summary>
/// <remarks>
/// The correctness of this optimization depends on nothing being able to backtrack into
/// the provided node. That means it must be at the root of the overall expression, or
/// it must be an Atomic node that nothing will backtrack into by the very nature of Atomic.
/// </remarks>
private void EliminateEndingBacktracking()
{
if (!StackHelper.TryEnsureSufficientExecutionStack() ||
(Options & (RegexOptions.RightToLeft | RegexOptions.NonBacktracking)) != 0)
{
// If we can't recur further, just stop optimizing.
// We haven't done the work to validate this is correct for RTL.
// And NonBacktracking doesn't support atomic groups and doesn't have backtracking to be eliminated.
return;
}
// Walk the tree starting from the current node.
RegexNode node = this;
while (true)
{
switch (node.Kind)
{
// {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes, e.g. [abc]* => (?>[abc]*).
// And {One/Notone/Set}lazys can similarly be upgraded to be atomic, which really makes them into repeaters
// or even empty nodes.
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop:
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy:
node.MakeLoopAtomic();
break;
// Just because a particular node is atomic doesn't mean all its descendants are.
// Process them as well. Lookarounds are implicitly atomic.
case RegexNodeKind.Atomic:
case RegexNodeKind.PositiveLookaround:
case RegexNodeKind.NegativeLookaround:
node = node.Child(0);
continue;
// For Capture and Concatenate, we just recur into their last child (only child in the case
// of Capture). However, if the child is an alternation or loop, we can also make the
// node itself atomic by wrapping it in an Atomic node. Since we later check to see whether a
// node is atomic based on its parent or grandparent, we don't bother wrapping such a node in
// an Atomic one if its grandparent is already Atomic.
// e.g. [xyz](?:abc|def) => [xyz](?>abc|def)
case RegexNodeKind.Capture:
case RegexNodeKind.Concatenate:
RegexNode existingChild = node.Child(node.ChildCount() - 1);
if ((existingChild.Kind is RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional or RegexNodeKind.Loop or RegexNodeKind.Lazyloop) &&
(node.Parent is null || node.Parent.Kind != RegexNodeKind.Atomic)) // validate grandparent isn't atomic
{
var atomic = new RegexNode(RegexNodeKind.Atomic, existingChild.Options);
atomic.AddChild(existingChild);
node.ReplaceChild(node.ChildCount() - 1, atomic);
}
node = existingChild;
continue;
// For alternate, we can recur into each branch separately. We use this iteration for the first branch.
// Conditionals are just like alternations in this regard.
// e.g. abc*|def* => ab(?>c*)|de(?>f*)
case RegexNodeKind.Alternate:
case RegexNodeKind.BackreferenceConditional:
case RegexNodeKind.ExpressionConditional:
{
int branches = node.ChildCount();
for (int i = 1; i < branches; i++)
{
node.Child(i).EliminateEndingBacktracking();
}
if (node.Kind != RegexNodeKind.ExpressionConditional) // ReduceExpressionConditional will have already applied ending backtracking removal
{
node = node.Child(0);
continue;
}
}
break;
// For {Lazy}Loop, we search to see if there's a viable last expression, and iff there
// is we recur into processing it. Also, as with the single-char lazy loops, LazyLoop
// can have its max iteration count dropped to its min iteration count, as there's no
// reason for it to match more than the minimal at the end; that in turn makes it a
// repeater, which results in better code generation.
// e.g. (?:abc*)* => (?:ab(?>c*))*
// e.g. (abc*?)+? => (ab){1}
case RegexNodeKind.Lazyloop:
node.N = node.M;
goto case RegexNodeKind.Loop;
case RegexNodeKind.Loop:
{
if (node.N == 1)
{
// If the loop has a max iteration count of 1 (e.g. it's an optional node),
// there's no possibility for conflict between multiple iterations, so
// we can process it.
node = node.Child(0);
continue;
}
RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic();
if (loopDescendent != null)
{
node = loopDescendent;
continue; // loop around to process node
}
}
break;
}
break;
}
}
/// <summary>
/// Removes redundant nodes from the subtree, and returns an optimized subtree.
/// </summary>
internal RegexNode Reduce()
{
// Remove IgnoreCase option from everything except a Backreference
switch (Kind)
{
default:
// No effect
Options &= ~RegexOptions.IgnoreCase;
break;
case RegexNodeKind.Backreference:
// Still meaningful
break;
}
return Kind switch
{
RegexNodeKind.Alternate => ReduceAlternation(),
RegexNodeKind.Atomic => ReduceAtomic(),
RegexNodeKind.Concatenate => ReduceConcatenation(),
RegexNodeKind.Group => ReduceGroup(),
RegexNodeKind.Loop or RegexNodeKind.Lazyloop => ReduceLoops(),
RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround => ReduceLookaround(),
RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy => ReduceSet(),
RegexNodeKind.ExpressionConditional => ReduceExpressionConditional(),
RegexNodeKind.BackreferenceConditional => ReduceBackreferenceConditional(),
_ => this,
};
}
/// <summary>Remove an unnecessary Concatenation or Alternation node</summary>
/// <remarks>
/// Simple optimization for a concatenation or alternation:
/// - if the node has only one child, use it instead
/// - if the node has zero children, turn it into an empty with Nothing for an alternation or Empty for a concatenation
/// </remarks>
private RegexNode ReplaceNodeIfUnnecessary()
{
Debug.Assert(Kind is RegexNodeKind.Alternate or RegexNodeKind.Concatenate);
return ChildCount() switch
{
0 => new RegexNode(Kind == RegexNodeKind.Alternate ? RegexNodeKind.Nothing : RegexNodeKind.Empty, Options),
1 => Child(0),
_ => this,
};
}
/// <summary>Remove all non-capturing groups.</summary>
/// <remark>
/// Simple optimization: once parsed into a tree, non-capturing groups
/// serve no function, so strip them out.
/// e.g. (?:(?:(?:abc))) => abc
/// </remark>
private RegexNode ReduceGroup()
{
Debug.Assert(Kind == RegexNodeKind.Group);
RegexNode u = this;
while (u.Kind == RegexNodeKind.Group)
{
Debug.Assert(u.ChildCount() == 1);
u = u.Child(0);
}
return u;
}
/// <summary>
/// Remove unnecessary atomic nodes, and make appropriate descendents of the atomic node themselves atomic.
/// </summary>
/// <remarks>
/// e.g. (?>(?>(?>a*))) => (?>a*)
/// e.g. (?>(abc*)*) => (?>(abc(?>c*))*)
/// </remarks>
private RegexNode ReduceAtomic()
{
// RegexOptions.NonBacktracking doesn't support atomic groups, so when that option
// is set we don't want to create atomic groups where they weren't explicitly authored.
if ((Options & RegexOptions.NonBacktracking) != 0)
{
return this;
}
Debug.Assert(Kind == RegexNodeKind.Atomic);
Debug.Assert(ChildCount() == 1);
RegexNode atomic = this;
RegexNode child = Child(0);
while (child.Kind == RegexNodeKind.Atomic)
{
atomic = child;
child = atomic.Child(0);
}
switch (child.Kind)
{
// If the child is empty/nothing, there's nothing to be made atomic so the Atomic
// node can simply be removed.
case RegexNodeKind.Empty:
case RegexNodeKind.Nothing:
return child;
// If the child is already atomic, we can just remove the atomic node.
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Setloopatomic:
return child;
// If an atomic subexpression contains only a {one/notone/set}{loop/lazy},
// change it to be an {one/notone/set}loopatomic and remove the atomic node.
case RegexNodeKind.Oneloop:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Setloop:
case RegexNodeKind.Onelazy:
case RegexNodeKind.Notonelazy:
case RegexNodeKind.Setlazy:
child.MakeLoopAtomic();
return child;
// Alternations have a variety of possible optimizations that can be applied
// iff they're atomic.
case RegexNodeKind.Alternate:
if ((Options & RegexOptions.RightToLeft) == 0)
{
List<RegexNode>? branches = child.Children as List<RegexNode>;
Debug.Assert(branches is not null && branches.Count != 0);
// If an alternation is atomic and its first branch is Empty, the whole thing
// is a nop, as Empty will match everything trivially, and no backtracking
// into the node will be performed, making the remaining branches irrelevant.
if (branches[0].Kind == RegexNodeKind.Empty)
{
return new RegexNode(RegexNodeKind.Empty, child.Options);
}
// Similarly, we can trim off any branches after an Empty, as they'll never be used.
// An Empty will match anything, and thus branches after that would only be used
// if we backtracked into it and advanced passed the Empty after trying the Empty...
// but if the alternation is atomic, such backtracking won't happen.
for (int i = 1; i < branches.Count - 1; i++)
{
if (branches[i].Kind == RegexNodeKind.Empty)
{
branches.RemoveRange(i + 1, branches.Count - (i + 1));
break;
}
}
// If an alternation is atomic, we won't ever backtrack back into it, which
// means order matters but not repetition. With backtracking, it would be incorrect
// to convert an expression like "hi|there|hello" into "hi|hello|there", as doing
// so could then change the order of results if we matched "hi" and then failed
// based on what came after it, and both "hello" and "there" could be successful
// with what came later. But without backtracking, we can reorder "hi|there|hello"
// to instead be "hi|hello|there", as "hello" and "there" can't match the same text,
// and once this atomic alternation has matched, we won't try another branch. This
// reordering is valuable as it then enables further optimizations, e.g.
// "hi|there|hello" => "hi|hello|there" => "h(?:i|ello)|there", which means we only
// need to check the 'h' once in case it's not an 'h', and it's easier to employ different
// code gen that, for example, switches on first character of the branches, enabling faster
// choice of branch without always having to walk through each.
bool reordered = false;
for (int start = 0; start < branches.Count; start++)
{
// Get the node that may start our range. If it's a one, multi, or concat of those, proceed.
RegexNode startNode = branches[start];
if (startNode.FindBranchOneOrMultiStart() is null)
{
continue;
}
// Find the contiguous range of nodes from this point that are similarly one, multi, or concat of those.
int endExclusive = start + 1;
while (endExclusive < branches.Count && branches[endExclusive].FindBranchOneOrMultiStart() is not null)
{
endExclusive++;
}
// If there's at least 3, there may be something to reorder (we won't reorder anything
// before the starting position, and so only 2 items is considered ordered).
if (endExclusive - start >= 3)
{
int compare = start;
while (compare < endExclusive)
{
// Get the starting character
char c = branches[compare].FindBranchOneOrMultiStart()!.FirstCharOfOneOrMulti();
// Move compare to point to the last branch that has the same starting value.
while (compare < endExclusive && branches[compare].FindBranchOneOrMultiStart()!.FirstCharOfOneOrMulti() == c)
{
compare++;
}
// Compare now points to the first node that doesn't match the starting node.
// If we've walked off our range, there's nothing left to reorder.
if (compare < endExclusive)
{
// There may be something to reorder. See if there are any other nodes that begin with the same character.
for (int next = compare + 1; next < endExclusive; next++)
{
RegexNode nextChild = branches[next];
if (nextChild.FindBranchOneOrMultiStart()!.FirstCharOfOneOrMulti() == c)
{
branches.RemoveAt(next);
branches.Insert(compare++, nextChild);
reordered = true;
}
}
}
}
}
// Move to the end of the range we've now explored. endExclusive is not a viable
// starting position either, and the start++ for the loop will thus take us to
// the next potential place to start a range.
start = endExclusive;
}
// If anything was reordered, there may be new optimization opportunities inside
// of the alternation, so reduce it again.
if (reordered)
{
atomic.ReplaceChild(0, child);
child = atomic.Child(0);
}
}
goto default;
// For everything else, try to reduce ending backtracking of the last contained expression.
default:
child.EliminateEndingBacktracking();
return atomic;
}
}
/// <summary>Combine nested loops where applicable.</summary>
/// <remarks>
/// Nested repeaters just get multiplied with each other if they're not too lumpy.
/// Other optimizations may have also resulted in {Lazy}loops directly containing
/// sets, ones, and notones, in which case they can be transformed into the corresponding
/// individual looping constructs.
/// </remarks>
private RegexNode ReduceLoops()
{
Debug.Assert(Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop);
RegexNode u = this;
RegexNodeKind kind = Kind;
int min = M;
int max = N;
while (u.ChildCount() > 0)
{
RegexNode child = u.Child(0);
// multiply reps of the same type only
if (child.Kind != kind)
{
bool valid = false;
if (kind == RegexNodeKind.Loop)
{
switch (child.Kind)
{
case RegexNodeKind.Oneloop:
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
valid = true;
break;
}
}
else // type == Lazyloop
{
switch (child.Kind)
{
case RegexNodeKind.Onelazy:
case RegexNodeKind.Notonelazy:
case RegexNodeKind.Setlazy:
valid = true;
break;
}
}
if (!valid)
{
break;
}
}
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
// [but things like (a {2,})+ are not too lumpy...]
if (u.M == 0 && child.M > 1 || child.N < child.M * 2)
{
break;
}
u = child;
if (u.M > 0)
{
u.M = min = ((int.MaxValue - 1) / u.M < min) ? int.MaxValue : u.M * min;
}
if (u.N > 0)
{
u.N = max = ((int.MaxValue - 1) / u.N < max) ? int.MaxValue : u.N * max;
}
}
if (min == int.MaxValue)
{
return new RegexNode(RegexNodeKind.Nothing, Options);
}
// If the Loop or Lazyloop now only has one child node and its a Set, One, or Notone,
// reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
// generally have only produced the latter, but other reductions could have exposed
// this.
if (u.ChildCount() == 1)
{
RegexNode child = u.Child(0);
switch (child.Kind)
{
case RegexNodeKind.One:
case RegexNodeKind.Notone:
case RegexNodeKind.Set:
child.MakeRep(u.Kind == RegexNodeKind.Lazyloop ? RegexNodeKind.Onelazy : RegexNodeKind.Oneloop, u.M, u.N);
u = child;
break;
}
}
return u;
}
/// <summary>
/// Reduces set-related nodes to simpler one-related and notone-related nodes, where applicable.
/// </summary>
/// <remarks>
/// e.g.
/// [a] => a
/// [a]* => a*
/// [a]*? => a*?
/// (?>[a]*) => (?>a*)
/// [^a] => ^a
/// []* => Nothing
/// </remarks>
private RegexNode ReduceSet()
{
// Extract empty-set, one, and not-one case as special
Debug.Assert(Kind is RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy);
Debug.Assert(!string.IsNullOrEmpty(Str));
if (RegexCharClass.IsEmpty(Str))
{
Kind = RegexNodeKind.Nothing;
Str = null;
}
else if (RegexCharClass.IsSingleton(Str))
{
Ch = RegexCharClass.SingletonChar(Str);
Str = null;
Kind =
Kind == RegexNodeKind.Set ? RegexNodeKind.One :
Kind == RegexNodeKind.Setloop ? RegexNodeKind.Oneloop :
Kind == RegexNodeKind.Setloopatomic ? RegexNodeKind.Oneloopatomic :
RegexNodeKind.Onelazy;
}
else if (RegexCharClass.IsSingletonInverse(Str))
{
Ch = RegexCharClass.SingletonChar(Str);
Str = null;
Kind =
Kind == RegexNodeKind.Set ? RegexNodeKind.Notone :
Kind == RegexNodeKind.Setloop ? RegexNodeKind.Notoneloop :
Kind == RegexNodeKind.Setloopatomic ? RegexNodeKind.Notoneloopatomic :
RegexNodeKind.Notonelazy;
}
// Normalize some well-known sets
switch (Str)
{
// Different ways of saying "match anything"
case RegexCharClass.WordNotWordClass:
case RegexCharClass.NotWordWordClass:
case RegexCharClass.DigitNotDigitClass:
case RegexCharClass.NotDigitDigitClass:
case RegexCharClass.SpaceNotSpaceClass:
case RegexCharClass.NotSpaceSpaceClass:
Str = RegexCharClass.AnyClass;
break;
}
return this;
}
/// <summary>Optimize an alternation.</summary>
private RegexNode ReduceAlternation()
{
Debug.Assert(Kind == RegexNodeKind.Alternate);
switch (ChildCount())
{
case 0:
return new RegexNode(RegexNodeKind.Nothing, Options);
case 1:
return Child(0);
default:
ReduceSingleLetterAndNestedAlternations();
RegexNode node = ReplaceNodeIfUnnecessary();
if (node.Kind == RegexNodeKind.Alternate)
{
node = ExtractCommonPrefixText(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = ExtractCommonPrefixOneNotoneSet(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = RemoveRedundantEmptiesAndNothings(node);
}
}
}
return node;
}
// This function performs two optimizations:
// - Single-letter alternations can be replaced by faster set specifications
// e.g. "a|b|c|def|g|h" -> "[a-c]|def|[gh]"
// - Nested alternations with no intervening operators can be flattened:
// e.g. "apple|(?:orange|pear)|grape" -> "apple|orange|pear|grape"
void ReduceSingleLetterAndNestedAlternations()
{
bool wasLastSet = false;
bool lastNodeCannotMerge = false;
RegexOptions optionsLast = 0;
RegexOptions optionsAt;
int i;
int j;
RegexNode at;
RegexNode prev;
List<RegexNode> children = (List<RegexNode>)Children!;
for (i = 0, j = 0; i < children.Count; i++, j++)
{
at = children[i];
if (j < i)
children[j] = at;
while (true)
{
if (at.Kind == RegexNodeKind.Alternate)
{
if (at.Children is List<RegexNode> atChildren)
{
for (int k = 0; k < atChildren.Count; k++)
{
atChildren[k].Parent = this;
}
children.InsertRange(i + 1, atChildren);
}
else
{
RegexNode atChild = (RegexNode)at.Children!;
atChild.Parent = this;
children.Insert(i + 1, atChild);
}
j--;
}
else if (at.Kind is RegexNodeKind.Set or RegexNodeKind.One)
{
// Cannot merge sets if L or I options differ, or if either are negated.
optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
if (at.Kind == RegexNodeKind.Set)
{
if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str!))
{
wasLastSet = true;
lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str!);
optionsLast = optionsAt;
break;
}
}
else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge)
{
wasLastSet = true;
lastNodeCannotMerge = false;
optionsLast = optionsAt;
break;
}
// The last node was a Set or a One, we're a Set or One and our options are the same.
// Merge the two nodes.
j--;
prev = children[j];
RegexCharClass prevCharClass;
if (prev.Kind == RegexNodeKind.One)
{
prevCharClass = new RegexCharClass();
prevCharClass.AddChar(prev.Ch);
}
else
{
prevCharClass = RegexCharClass.Parse(prev.Str!);
}
if (at.Kind == RegexNodeKind.One)
{
prevCharClass.AddChar(at.Ch);
}
else
{
RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);
prevCharClass.AddCharClass(atCharClass);
}
prev.Kind = RegexNodeKind.Set;
prev.Str = prevCharClass.ToStringClass();
if ((prev.Options & RegexOptions.IgnoreCase) != 0)
{
prev.Options &= ~RegexOptions.IgnoreCase;
}
}
else if (at.Kind == RegexNodeKind.Nothing)
{
j--;
}
else
{
wasLastSet = false;
lastNodeCannotMerge = false;
}
break;
}
}
if (j < i)
{
children.RemoveRange(j, i - j);
}
}
// This function optimizes out prefix nodes from alternation branches that are
// the same across multiple contiguous branches.
// e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
{
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
var children = (List<RegexNode>)alternation.Children;
// Only process left-to-right prefixes.
if ((alternation.Options & RegexOptions.RightToLeft) != 0)
{
return alternation;
}
// Only handle the case where each branch is a concatenation
foreach (RegexNode child in children)
{
if (child.Kind != RegexNodeKind.Concatenate || child.ChildCount() < 2)
{
return alternation;
}
}
for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++)
{
Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });
// Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
// different number of characters consumed by the loop based on what's after it.
RegexNode required = children[startingIndex].Child(0);
switch (required.Kind)
{
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
break;
default:
continue;
}
// Only handle the case where each branch begins with the exact same node value
int endingIndex = startingIndex + 1;
for (; endingIndex < children.Count; endingIndex++)
{
RegexNode other = children[endingIndex].Child(0);
if (required.Kind != other.Kind ||
required.Options != other.Options ||
required.M != other.M ||
required.N != other.N ||
required.Ch != other.Ch ||
required.Str != other.Str)
{
break;
}
}
if (endingIndex - startingIndex <= 1)
{
// Nothing to extract from this starting index.
continue;
}
// Remove the prefix node from every branch, adding it to a new alternation
var newAlternate = new RegexNode(RegexNodeKind.Alternate, alternation.Options);
for (int i = startingIndex; i < endingIndex; i++)
{
((List<RegexNode>)children[i].Children!).RemoveAt(0);
newAlternate.AddChild(children[i]);
}
// If this alternation is wrapped as atomic, we need to do the same for the new alternation.
if (alternation.Parent is RegexNode { Kind: RegexNodeKind.Atomic })
{
var atomic = new RegexNode(RegexNodeKind.Atomic, alternation.Options);
atomic.AddChild(newAlternate);
newAlternate = atomic;
}
// Now create a concatenation of the prefix node with the new alternation for the combined
// branches, and replace all of the branches in this alternation with that new concatenation.
var newConcat = new RegexNode(RegexNodeKind.Concatenate, alternation.Options);
newConcat.AddChild(required);
newConcat.AddChild(newAlternate);
alternation.ReplaceChild(startingIndex, newConcat);
children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1);
}
return alternation.ReplaceNodeIfUnnecessary();
}
// Removes unnecessary Empty and Nothing nodes from the alternation. A Nothing will never
// match, so it can be removed entirely, and an Empty can be removed if there's a previous
// Empty in the alternation: it's an extreme case of just having a repeated branch in an
// alternation, and while we don't check for all duplicates, checking for empty is easy.
static RegexNode RemoveRedundantEmptiesAndNothings(RegexNode node)
{
Debug.Assert(node.Kind == RegexNodeKind.Alternate);
Debug.Assert(node.ChildCount() >= 2);
var children = (List<RegexNode>)node.Children!;
int i = 0, j = 0;
bool seenEmpty = false;
while (i < children.Count)
{
RegexNode child = children[i];
switch (child.Kind)
{
case RegexNodeKind.Empty when !seenEmpty:
seenEmpty = true;
goto default;
case RegexNodeKind.Empty:
case RegexNodeKind.Nothing:
i++;
break;
default:
children[j] = children[i];
i++;
j++;
break;
}
}
children.RemoveRange(j, children.Count - j);
return node.ReplaceNodeIfUnnecessary();
}
// Analyzes all the branches of the alternation for text that's identical at the beginning
// of every branch. That text is then pulled out into its own one or multi node in a
// concatenation with the alternation (whose branches are updated to remove that prefix).
// This is valuable for a few reasons. One, it exposes potentially more text to the
// expression prefix analyzer used to influence FindFirstChar. Second, it exposes more
// potential alternation optimizations, e.g. if the same prefix is followed in two branches
// by sets that can be merged. Third, it reduces the amount of duplicated comparisons required
// if we end up backtracking into subsequent branches.
// e.g. abc|ade => a(?bc|de)
static RegexNode ExtractCommonPrefixText(RegexNode alternation)
{
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
var children = (List<RegexNode>)alternation.Children;
// To keep things relatively simple, we currently only handle:
// - Left to right (e.g. we don't process alternations in lookbehinds)
// - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes.
// - All branches having the same options.
// Only extract left-to-right prefixes.
if ((alternation.Options & RegexOptions.RightToLeft) != 0)
{
return alternation;
}
Span<char> scratchChar = stackalloc char[1];
for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++)
{
// Process the first branch to get the maximum possible common string.
RegexNode? startingNode = children[startingIndex].FindBranchOneOrMultiStart();
if (startingNode is null)
{
return alternation;
}
RegexOptions startingNodeOptions = startingNode.Options;
scoped ReadOnlySpan<char> startingSpan = startingNode.Str.AsSpan();
if (startingNode.Kind == RegexNodeKind.One)
{
scratchChar[0] = startingNode.Ch;
startingSpan = scratchChar;
}
Debug.Assert(startingSpan.Length > 0);
// Now compare the rest of the branches against it.
int endingIndex = startingIndex + 1;
for (; endingIndex < children.Count; endingIndex++)
{
// Get the starting node of the next branch.
startingNode = children[endingIndex].FindBranchOneOrMultiStart();
if (startingNode is null || startingNode.Options != startingNodeOptions)
{
break;
}
// See if the new branch's prefix has a shared prefix with the current one.
// If it does, shorten to that; if it doesn't, bail.
if (startingNode.Kind == RegexNodeKind.One)
{
if (startingSpan[0] != startingNode.Ch)
{
break;
}
if (startingSpan.Length != 1)
{
startingSpan = startingSpan.Slice(0, 1);
}
}
else
{
Debug.Assert(startingNode.Kind == RegexNodeKind.Multi);
Debug.Assert(startingNode.Str!.Length > 0);
int minLength = Math.Min(startingSpan.Length, startingNode.Str.Length);
int c = 0;
while (c < minLength && startingSpan[c] == startingNode.Str[c]) c++;
if (c == 0)
{
break;
}
startingSpan = startingSpan.Slice(0, c);
}
}
// When we get here, we have a starting string prefix shared by all branches
// in the range [startingIndex, endingIndex).
if (endingIndex - startingIndex <= 1)
{
// There's nothing to consolidate for this starting node.
continue;
}
// We should be able to consolidate something for the nodes in the range [startingIndex, endingIndex).
Debug.Assert(startingSpan.Length > 0);
// Create a new node of the form:
// Concatenation(prefix, Alternation(each | node | with | prefix | removed))
// that replaces all these branches in this alternation.
var prefix = startingSpan.Length == 1 ?
new RegexNode(RegexNodeKind.One, startingNodeOptions, startingSpan[0]) :
new RegexNode(RegexNodeKind.Multi, startingNodeOptions, startingSpan.ToString());
var newAlternate = new RegexNode(RegexNodeKind.Alternate, startingNodeOptions);
for (int i = startingIndex; i < endingIndex; i++)
{
RegexNode branch = children[i];
ProcessOneOrMulti(branch.Kind == RegexNodeKind.Concatenate ? branch.Child(0) : branch, startingSpan);
branch = branch.Reduce();
newAlternate.AddChild(branch);
// Remove the starting text from the one or multi node. This may end up changing
// the type of the node to be Empty if the starting text matches the node's full value.
static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
{
if (node.Kind == RegexNodeKind.One)
{
Debug.Assert(startingSpan.Length == 1);
Debug.Assert(startingSpan[0] == node.Ch);
node.Kind = RegexNodeKind.Empty;
node.Ch = '\0';
}
else
{
Debug.Assert(node.Kind == RegexNodeKind.Multi);
Debug.Assert(node.Str.AsSpan().StartsWith(startingSpan, StringComparison.Ordinal));
if (node.Str!.Length == startingSpan.Length)
{
node.Kind = RegexNodeKind.Empty;
node.Str = null;
}
else if (node.Str.Length - 1 == startingSpan.Length)
{
node.Kind = RegexNodeKind.One;
node.Ch = node.Str[node.Str.Length - 1];
node.Str = null;
}
else
{
node.Str = node.Str.Substring(startingSpan.Length);
}
}
}
}
if (alternation.Parent is RegexNode parent && parent.Kind == RegexNodeKind.Atomic)
{
var atomic = new RegexNode(RegexNodeKind.Atomic, startingNodeOptions);
atomic.AddChild(newAlternate);
newAlternate = atomic;
}
var newConcat = new RegexNode(RegexNodeKind.Concatenate, startingNodeOptions);
newConcat.AddChild(prefix);
newConcat.AddChild(newAlternate);
alternation.ReplaceChild(startingIndex, newConcat);
children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1);
}
return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation;
}
}
/// <summary>
/// Finds the starting one or multi of the branch, if it has one; otherwise, returns null.
/// For simplicity, this only considers branches that are One or Multi, or a Concatenation
/// beginning with a One or Multi. We don't traverse more than one level to avoid the
/// complication of then having to later update that hierarchy when removing the prefix,
/// but it could be done in the future if proven beneficial enough.
/// </summary>
public RegexNode? FindBranchOneOrMultiStart()
{
RegexNode branch = Kind == RegexNodeKind.Concatenate ? Child(0) : this;
return branch.Kind is RegexNodeKind.One or RegexNodeKind.Multi ? branch : null;
}
/// <summary>Gets the character that begins a One or Multi.</summary>
public char FirstCharOfOneOrMulti()
{
Debug.Assert(Kind is RegexNodeKind.One or RegexNodeKind.Multi || (IsOneFamily && M > 0));
Debug.Assert((Options & RegexOptions.RightToLeft) == 0);
return IsOneFamily ? Ch : Str![0];
}
/// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
public RegexNode? FindStartingLiteralNode(bool allowZeroWidth = true)
{
RegexNode? node = this;
while (true)
{
if (node is not null && (node.Options & RegexOptions.RightToLeft) == 0)
{
switch (node.Kind)
{
case RegexNodeKind.One:
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when node.M > 0:
case RegexNodeKind.Notone:
case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when node.M > 0:
case RegexNodeKind.Set:
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when node.M > 0:
case RegexNodeKind.Multi:
return node;
case RegexNodeKind.Atomic:
case RegexNodeKind.Concatenate:
case RegexNodeKind.Capture:
case RegexNodeKind.Group:
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0:
case RegexNodeKind.PositiveLookaround when allowZeroWidth:
node = node.Child(0);
continue;
}
}
return null;
}
}
/// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
/// <returns>
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
/// </returns>
public StartingLiteralData? FindStartingLiteral()
{
if (FindStartingLiteralNode() is RegexNode node)
{
switch (node.Kind)
{
case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy:
return new StartingLiteralData(range: (node.Ch, node.Ch), negated: false);
case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy:
return new StartingLiteralData(range: (node.Ch, node.Ch), negated: true);
case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy:
if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive) &&
(highInclusive - lowInclusive) > 1) // prefer IndexOfAny for 1 or 2 elements as an optimization
{
Debug.Assert(lowInclusive < highInclusive);
return new StartingLiteralData(range: (lowInclusive, highInclusive), negated: RegexCharClass.IsNegated(node.Str!));
}
Span<char> setChars = stackalloc char[128];
int numChars;
if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
{
return new StartingLiteralData(setChars: setChars.Slice(0, numChars).ToString(), negated: RegexCharClass.IsNegated(node.Str!));
}
break;
case RegexNodeKind.Multi:
return new StartingLiteralData(@string: node.Str);
}
}
return null;
}
/// <summary>Data about a starting literal as returned by <see cref="FindStartingLiteral"/>.</summary>
public readonly struct StartingLiteralData
{
public readonly (char LowInclusive, char HighInclusive) Range;
public readonly string? String;
public readonly string? SetChars;
public readonly bool Negated;
public StartingLiteralData((char LowInclusive, char HighInclusive) range, bool negated)
{
Range = range;
Negated = negated;
}
public StartingLiteralData(string? @string)
{
Debug.Assert(@string is not null);
String = @string;
}
public StartingLiteralData(string? setChars, bool negated)
{
Debug.Assert(setChars is not null);
SetChars = setChars;
Negated = negated;
}
}
/// <summary>
/// Optimizes a concatenation by coalescing adjacent characters and strings,
/// coalescing adjacent loops, converting loops to be atomic where applicable,
/// and removing the concatenation itself if it's unnecessary.
/// </summary>
private RegexNode ReduceConcatenation()
{
Debug.Assert(Kind == RegexNodeKind.Concatenate);
// If the concat node has zero or only one child, get rid of the concat.
switch (ChildCount())
{
case 0:
return new RegexNode(RegexNodeKind.Empty, Options);
case 1:
return Child(0);
}
// If any node in the concatenation is a Nothing, the concatenation itself is a Nothing.
int childCount = ChildCount();
for (int i = 0; i < childCount; i++)
{
RegexNode child = Child(i);
if (child.Kind == RegexNodeKind.Nothing)
{
return child;
}
}
// Coalesce adjacent loops. This helps to minimize work done by the interpreter, minimize code gen,
// and also help to reduce catastrophic backtracking.
ReduceConcatenationWithAdjacentLoops();
// Coalesce adjacent characters/strings. This is done after the adjacent loop coalescing so that
// a One adjacent to both a Multi and a Loop prefers being folded into the Loop rather than into
// the Multi. Doing so helps with auto-atomicity when it's later applied.
ReduceConcatenationWithAdjacentStrings();
// If the concatenation is now empty, return an empty node, or if it's got a single child, return that child.
// Otherwise, return this.
return ReplaceNodeIfUnnecessary();
}
/// <summary>
/// Combine adjacent characters/strings.
/// e.g. (?:abc)(?:def) -> abcdef
/// </summary>
private void ReduceConcatenationWithAdjacentStrings()
{
Debug.Assert(Kind == RegexNodeKind.Concatenate);
Debug.Assert(Children is List<RegexNode>);
bool wasLastString = false;
RegexOptions optionsLast = 0;
int i, j;
List<RegexNode> children = (List<RegexNode>)Children!;
for (i = 0, j = 0; i < children.Count; i++, j++)
{
RegexNode at = children[i];
if (j < i)
{
children[j] = at;
}
if (at.Kind == RegexNodeKind.Concatenate &&
((at.Options & RegexOptions.RightToLeft) == (Options & RegexOptions.RightToLeft)))
{
if (at.Children is List<RegexNode> atChildren)
{
for (int k = 0; k < atChildren.Count; k++)
{
atChildren[k].Parent = this;
}
children.InsertRange(i + 1, atChildren);
}
else
{
RegexNode atChild = (RegexNode)at.Children!;
atChild.Parent = this;
children.Insert(i + 1, atChild);
}
j--;
}
else if (at.Kind is RegexNodeKind.Multi or RegexNodeKind.One)
{
// Cannot merge strings if L or I options differ
RegexOptions optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
if (!wasLastString || optionsLast != optionsAt)
{
wasLastString = true;
optionsLast = optionsAt;
continue;
}
RegexNode prev = children[--j];
if (prev.Kind == RegexNodeKind.One)
{
prev.Kind = RegexNodeKind.Multi;
prev.Str = prev.Ch.ToString();
}
prev.Str = (optionsAt & RegexOptions.RightToLeft) == 0 ?
((at.Kind == RegexNodeKind.One) ? $"{prev.Str}{at.Ch}" : prev.Str + at.Str) :
((at.Kind == RegexNodeKind.One) ? $"{at.Ch}{prev.Str}" : at.Str + prev.Str);
}
else if (at.Kind == RegexNodeKind.Empty)
{
j--;
}
else
{
wasLastString = false;
}
}
if (j < i)
{
children.RemoveRange(j, i - j);
}
}
/// <summary>
/// Combine adjacent loops.
/// e.g. a*a*a* => a*
/// e.g. a+ab => a{2,}b
/// </summary>
private void ReduceConcatenationWithAdjacentLoops()
{
Debug.Assert(Kind == RegexNodeKind.Concatenate);
Debug.Assert(Children is List<RegexNode>);
var children = (List<RegexNode>)Children!;
int current = 0, next = 1, nextSave = 1;
while (next < children.Count)
{
RegexNode currentNode = children[current];
RegexNode nextNode = children[next];
if (currentNode.Options == nextNode.Options)
{
static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
{
// We shouldn't have an infinite minimum; bail if we find one. Also check for the
// degenerate case where we'd make the min overflow or go infinite when it wasn't already.
if (nodeMin == int.MaxValue ||
nextMin == int.MaxValue ||
(uint)nodeMin + (uint)nextMin >= int.MaxValue)
{
return false;
}
// Similar overflow / go infinite check for max (which can be infinite).
if (nodeMax != int.MaxValue &&
nextMax != int.MaxValue &&
(uint)nodeMax + (uint)nextMax >= int.MaxValue)
{
return false;
}
return true;
}
switch (currentNode.Kind)
{
// Coalescing a loop with its same type
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == currentNode.Kind && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == currentNode.Kind && currentNode.Str == nextNode.Str:
if (nextNode.M > 0 &&
currentNode.Kind is RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic)
{
// Atomic loops can only be combined if the second loop has no lower bound, as if it has a lower bound,
// combining them changes behavior. Uncombined, the first loop can consume all matching items;
// the second loop might then not be able to meet its minimum and fail. But if they're combined, the combined
// minimum of the sole loop could now be met, introducing matches where there shouldn't have been any.
break;
}
if (!CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N))
{
break;
}
currentNode.M += nextNode.M;
if (currentNode.N != int.MaxValue)
{
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N;
}
next++;
continue;
// Coalescing a loop with an additional item of the same type
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Setloop or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
if (CanCombineCounts(currentNode.M, currentNode.N, 1, 1))
{
currentNode.M++;
if (currentNode.N != int.MaxValue)
{
currentNode.N++;
}
next++;
continue;
}
break;
// Coalescing a loop with a subsequent string
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when
nextNode.Kind == RegexNodeKind.Multi &&
(nextNode.Options & RegexOptions.RightToLeft) == 0 && // RTL multi nodes don't have their text reversed, and it's not worth the code to optimize further
currentNode.Ch == nextNode.Str![0]:
{
// Determine how many of the multi's characters can be combined.
// We already checked for the first, so we know it's at least one.
int matchingCharsInMulti = 1;
while (matchingCharsInMulti < nextNode.Str.Length && currentNode.Ch == nextNode.Str[matchingCharsInMulti])
{
matchingCharsInMulti++;
}
if (CanCombineCounts(currentNode.M, currentNode.N, matchingCharsInMulti, matchingCharsInMulti))
{
// Update the loop's bounds to include those characters from the multi
currentNode.M += matchingCharsInMulti;
if (currentNode.N != int.MaxValue)
{
currentNode.N += matchingCharsInMulti;
}
// If it was the full multi, skip/remove the multi and continue processing this loop.
if (nextNode.Str.Length == matchingCharsInMulti)
{
next++;
continue;
}
// Otherwise, trim the characters from the multiple that were absorbed into the loop.
// If it now only has a single character, it becomes a One.
Debug.Assert(matchingCharsInMulti < nextNode.Str.Length);
if (nextNode.Str.Length - matchingCharsInMulti == 1)
{
nextNode.Kind = RegexNodeKind.One;
nextNode.Ch = nextNode.Str[nextNode.Str.Length - 1];
nextNode.Str = null;
}
else
{
nextNode.Str = nextNode.Str.Substring(matchingCharsInMulti);
}
}
}
break;
// NOTE: We could add support for coalescing a string with a subsequent loop, but the benefits of that
// are limited. Pulling a subsequent string's prefix back into the loop helps with making the loop atomic,
// but if the loop is after the string, pulling the suffix of the string forward into the loop may actually
// be a deoptimization as those characters could end up matching more slowly as part of loop matching.
// Coalescing an individual item with a loop.
case RegexNodeKind.One when (nextNode.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy) && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Notone when (nextNode.Kind is RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy) && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Set when (nextNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy) && currentNode.Str == nextNode.Str:
if (CanCombineCounts(1, 1, nextNode.M, nextNode.N))
{
currentNode.Kind = nextNode.Kind;
currentNode.M = nextNode.M + 1;
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : nextNode.N + 1;
next++;
continue;
}
break;
// Coalescing an individual item with another individual item.
// We don't coalesce adjacent One nodes into a Oneloop as we'd rather they be joined into a Multi.
case RegexNodeKind.Notone when nextNode.Kind == currentNode.Kind && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Set when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
currentNode.MakeRep(RegexNodeKind.Oneloop, 2, 2);
next++;
continue;
}
}
children[nextSave++] = children[next];
current = next;
next++;
}
if (nextSave < children.Count)
{
children.RemoveRange(nextSave, children.Count - nextSave);
}
}
/// <summary>
/// Finds {one/notone/set}loop nodes in the concatenation that can be automatically upgraded
/// to {one/notone/set}loopatomic nodes. Such changes avoid potential useless backtracking.
/// e.g. A*B (where sets A and B don't overlap) => (?>A*)B.
/// </summary>
private void FindAndMakeLoopsAtomic()
{
Debug.Assert((Options & RegexOptions.NonBacktracking) == 0, "Atomic groups aren't supported and don't help performance with NonBacktracking");
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we're too deep on the stack, give up optimizing further.
return;
}
if ((Options & RegexOptions.RightToLeft) != 0)
{
// RTL is so rare, we don't need to spend additional time/code optimizing for it.
return;
}
// For all node types that have children, recur into each of those children.
int childCount = ChildCount();
if (childCount != 0)
{
for (int i = 0; i < childCount; i++)
{
Child(i).FindAndMakeLoopsAtomic();
}
}
// If this isn't a concatenation, nothing more to do.
if (Kind is not RegexNodeKind.Concatenate)
{
return;
}
// This is a concatenation. Iterate through each pair of nodes in the concatenation seeing whether we can
// make the first node (or its right-most child) atomic based on the second node (or its left-most child).
Debug.Assert(Children is List<RegexNode>);
var children = (List<RegexNode>)Children;
for (int i = 0; i < childCount - 1; i++)
{
ProcessNode(children[i], children[i + 1]);
static void ProcessNode(RegexNode node, RegexNode subsequent)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we can't recur further, just stop optimizing.
return;
}
// Skip down the node past irrelevant nodes.
while (true)
{
// We can always recur into captures and into the last node of concatenations.
if (node.Kind is RegexNodeKind.Capture or RegexNodeKind.Concatenate)
{
node = node.Child(node.ChildCount() - 1);
continue;
}
// For loops with at least one guaranteed iteration, we can recur into them, but
// we need to be careful not to just always do so; the ending node of a loop can only
// be made atomic if what comes after the loop but also the beginning of the loop are
// compatible for the optimization.
if (node.Kind == RegexNodeKind.Loop)
{
RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic();
if (loopDescendent != null)
{
node = loopDescendent;
continue;
}
}
// Can't skip any further.
break;
}
// If the node can be changed to atomic based on what comes after it, do so.
switch (node.Kind)
{
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: true, allowLazy: false):
// The greedy loop doesn't overlap with what comes after it, which means giving anything it matches back will not
// help the overall match to succeed, which means it can simply become atomic to match as much as possible. The call
// to CanBeMadeAtomic passes iterateNullableSubsequent=true because, in a pattern like a*b*c*, when analyzing a*, we
// want to examine the b* and the c* rather than just giving up after seeing that b* is nullable; in order to make
// the a* atomic, we need to know that anything that could possibly come after the loop doesn't overlap.
node.MakeLoopAtomic();
break;
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: false, allowLazy: true):
// The lazy loop doesn't overlap with what comes after it, which means it needs to match as much as its allowed
// to match in order for there to be a possibility that what comes next matches (if it doesn't match as much
// as it's allowed and there was still more it could match, then what comes next is guaranteed to not match,
// since it doesn't match any of the same things the loop matches). We don't want to just make the lazy loop
// atomic, as an atomic lazy loop matches as little as possible, not as much as possible. Instead, we want to
// make the lazy loop into an atomic greedy loop. Note that when we check CanBeMadeAtomic, we need to set
// "iterateNullableSubsequent" to false so that we only inspect non-nullable subsequent nodes. For example,
// given a pattern like a*?b, we want to upgrade that loop to being greedy atomic, e.g. (?>a*)b. But given a
// pattern like a*?b*, the subsequent node is nullable, which means it doesn't have to be part of a match, which
// means the a*? could match by itself, in which case as it's lazy it needs to match as few a's as possible, e.g.
// a+?b* against the input "aaaab" should match "a", not "aaaa" nor "aaaab". (Technically for lazy, we only need to prevent
// walking off the end of the pattern, but it's not currently worth complicating the implementation for that case.)
// allowLazy is set to true so that the implementation will analyze rather than ignore this node; generally lazy nodes
// are ignored due to making them atomic not generally being a sound change, but here we're explicitly choosing to
// given the circumstances.
node.Kind -= RegexNodeKind.Onelazy - RegexNodeKind.Oneloop; // lazy to greedy
node.MakeLoopAtomic();
break;
case RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional:
// In the case of alternation, we can't change the alternation node itself
// based on what comes after it (at least not with more complicated analysis
// that factors in all branches together), but we can look at each individual
// branch, and analyze ending loops in each branch individually to see if they
// can be made atomic. Then if we do end up backtracking into the alternation,
// we at least won't need to backtrack into that loop. The same is true for
// conditionals, though we don't want to process the condition expression
// itself, as it's already considered atomic and handled as part of ReduceExpressionConditional.
{
int alternateBranches = node.ChildCount();
for (int b = node.Kind == RegexNodeKind.ExpressionConditional ? 1 : 0; b < alternateBranches; b++)
{
ProcessNode(node.Child(b), subsequent);
}
}
break;
}
}
}
}
/// <summary>
/// Recurs into the last expression of a loop node, looking to see if it can find a node
/// that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors.
/// </summary>
/// <returns>The found node that should be explored further for auto-atomicity; null if it doesn't exist.</returns>
private RegexNode? FindLastExpressionInLoopForAutoAtomic()
{
RegexNode node = this;
Debug.Assert(node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop);
// Start by looking at the loop's sole child.
node = node.Child(0);
// Skip past captures.
while (node.Kind == RegexNodeKind.Capture)
{
node = node.Child(0);
}
// If the loop's body is a concatenate, we can skip to its last child iff that
// last child doesn't conflict with the first child, since this whole concatenation
// could be repeated, such that the first node ends up following the last. For
// example, in the expression (a+[def])*, the last child is [def] and the first is
// a+, which can't possibly overlap with [def]. In contrast, if we had (a+[ade])*,
// [ade] could potentially match the starting 'a'.
if (node.Kind == RegexNodeKind.Concatenate)
{
int concatCount = node.ChildCount();
RegexNode lastConcatChild = node.Child(concatCount - 1);
if (CanBeMadeAtomic(lastConcatChild, node.Child(0), iterateNullableSubsequent: false, allowLazy: false))
{
return lastConcatChild;
}
}
// Otherwise, the loop has nothing that can participate in auto-atomicity.
return null;
}
/// <summary>Optimizations for positive and negative lookaheads/behinds.</summary>
private RegexNode ReduceLookaround()
{
Debug.Assert(Kind is RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround);
Debug.Assert(ChildCount() == 1);
// A lookaround is a zero-width atomic assertion.
// As it's atomic, nothing will backtrack into it, and we can
// eliminate any ending backtracking from it.
EliminateEndingBacktracking();
// A positive lookaround wrapped around an empty is a nop, and we can reduce it
// to simply Empty. A developer typically doesn't write this, but rather it evolves
// due to optimizations resulting in empty.
// A negative lookaround wrapped around an empty child, i.e. (?!), is
// sometimes used as a way to insert a guaranteed no-match into the expression,
// often as part of a conditional. We can reduce it to simply Nothing.
if (Child(0).Kind == RegexNodeKind.Empty)
{
Kind = Kind == RegexNodeKind.PositiveLookaround ? RegexNodeKind.Empty : RegexNodeKind.Nothing;
Children = null;
}
return this;
}
/// <summary>Optimizations for backreference conditionals.</summary>
private RegexNode ReduceBackreferenceConditional()
{
Debug.Assert(Kind == RegexNodeKind.BackreferenceConditional);
Debug.Assert(ChildCount() is 1 or 2);
// This isn't so much an optimization as it is changing the tree for consistency. We want
// all engines to be able to trust that every backreference conditional will have two children,
// even though it's optional in the syntax. If it's missing a "not matched" branch,
// we add one that will match empty.
if (ChildCount() == 1)
{
AddChild(new RegexNode(RegexNodeKind.Empty, Options));
}
return this;
}
/// <summary>Optimizations for expression conditionals.</summary>
private RegexNode ReduceExpressionConditional()
{
Debug.Assert(Kind == RegexNodeKind.ExpressionConditional);
Debug.Assert(ChildCount() is 2 or 3);
// This isn't so much an optimization as it is changing the tree for consistency. We want
// all engines to be able to trust that every expression conditional will have three children,
// even though it's optional in the syntax. If it's missing a "not matched" branch,
// we add one that will match empty.
if (ChildCount() == 2)
{
AddChild(new RegexNode(RegexNodeKind.Empty, Options));
}
// It's common for the condition to be an explicit positive lookahead, as specifying
// that eliminates any ambiguity in syntax as to whether the expression is to be matched
// as an expression or to be a reference to a capture group. After parsing, however,
// there's no ambiguity, and we can remove an extra level of positive lookahead, as the
// engines need to treat the condition as a zero-width positive, atomic assertion regardless.
RegexNode condition = Child(0);
if (condition.Kind == RegexNodeKind.PositiveLookaround && (condition.Options & RegexOptions.RightToLeft) == 0)
{
ReplaceChild(0, condition.Child(0));
}
// We can also eliminate any ending backtracking in the condition, as the condition
// is considered to be a positive lookahead, which is an atomic zero-width assertion.
condition = Child(0);
condition.EliminateEndingBacktracking();
return this;
}
/// <summary>Determines whether a node can be switched to an atomic loop.</summary>
/// <param name="node">The node being examined to determine whether it could be made atomic.</param>
/// <param name="subsequent">The node following <paramref name="node"/>, used to determine whether it overlaps.</param>
/// <param name="iterateNullableSubsequent">Whether to allow examining nodes beyond <paramref name="subsequent"/> if <paramref name="subsequent"/> is nullable.</param>
/// <param name="allowLazy">Whether lazy loops in addition to greedy loops should be considered for atomicity.</param>
private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool iterateNullableSubsequent, bool allowLazy)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we can't recur further, just stop optimizing.
return false;
}
// In most case, we'll simply check the node against whatever subsequent is. However, in case
// subsequent ends up being a loop with a min bound of 0, we'll also need to evaluate the node
// against whatever comes after subsequent. In that case, we'll walk the tree to find the
// next subsequent, and we'll loop around against to perform the comparison again.
while (true)
{
// Skip the successor down to the closest node that's guaranteed to follow it.
int childCount;
while ((childCount = subsequent.ChildCount()) > 0)
{
Debug.Assert(subsequent.Kind != RegexNodeKind.Group);
switch (subsequent.Kind)
{
case RegexNodeKind.Concatenate:
case RegexNodeKind.Capture:
case RegexNodeKind.Atomic:
case RegexNodeKind.PositiveLookaround when (subsequent.Options & RegexOptions.RightToLeft) == 0: // only lookaheads, not lookbehinds (represented as RTL PositiveLookaround nodes)
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when subsequent.M > 0:
subsequent = subsequent.Child(0);
continue;
}
break;
}
// If the current node's options don't match the subsequent node, then we cannot make it atomic.
// This applies to RightToLeft for lookbehinds, as well as patterns that enable/disable global flags in the middle of the pattern.
if (node.Options != subsequent.Options)
{
return false;
}
// If the successor is an alternation, all of its children need to be evaluated, since any of them
// could come after this node. If any of them fail the optimization, then the whole node fails.
// This applies to expression conditionals as well, as long as they have both a yes and a no branch (if there's
// only a yes branch, we'd need to also check whatever comes after the conditional). It doesn't apply to
// backreference conditionals, as the condition itself is unknown statically and could overlap with the
// loop being considered for atomicity.
switch (subsequent.Kind)
{
case RegexNodeKind.Alternate:
case RegexNodeKind.ExpressionConditional when childCount == 3: // condition, yes, and no branch
for (int i = 0; i < childCount; i++)
{
if (!CanBeMadeAtomic(node, subsequent.Child(i), iterateNullableSubsequent, allowLazy: false))
{
return false;
}
}
return true;
}
// If this node is a {one/notone/set}loop, see if it overlaps with its successor in the concatenation.
// If it doesn't, then we can upgrade it to being a {one/notone/set}loopatomic.
// Doing so avoids unnecessary backtracking.
switch (node.Kind)
{
case RegexNodeKind.Oneloop:
case RegexNodeKind.Onelazy when allowLazy:
switch (subsequent.Kind)
{
case RegexNodeKind.One when node.Ch != subsequent.Ch:
case RegexNodeKind.Notone when node.Ch == subsequent.Ch:
case RegexNodeKind.Set when !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M > 0 && node.Ch != subsequent.Ch:
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M > 0 && node.Ch == subsequent.Ch:
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M > 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]:
case RegexNodeKind.End:
case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n':
return true;
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;
default:
return false;
}
break;
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notonelazy when allowLazy:
switch (subsequent.Kind)
{
case RegexNodeKind.One when node.Ch == subsequent.Ch:
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M > 0 && node.Ch == subsequent.Ch:
case RegexNodeKind.Multi when node.Ch == subsequent.Str![0]:
case RegexNodeKind.End:
return true;
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;
default:
return false;
}
break;
case RegexNodeKind.Setloop:
case RegexNodeKind.Setlazy when allowLazy:
switch (subsequent.Kind)
{
case RegexNodeKind.One when !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Set when !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M > 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
case RegexNodeKind.End:
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!):
return true;
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;
default:
return false;
}
break;
default:
return false;
}
// We only get here if the node could be made atomic based on subsequent but subsequent has a lower bound of zero
// and thus we need to move subsequent to be the next node in sequence and loop around to try again.
if (!iterateNullableSubsequent)
{
return false;
}
// To be conservative, we only walk up through a very limited set of constructs (even though we may have walked
// down through more, like loops), looking for the next concatenation that we're not at the end of, at
// which point subsequent becomes whatever node is next in that concatenation.
while (true)
{
RegexNode? parent = subsequent.Parent;
switch (parent?.Kind)
{
case RegexNodeKind.Atomic:
case RegexNodeKind.Alternate:
case RegexNodeKind.Capture:
subsequent = parent;
continue;
case RegexNodeKind.Concatenate:
var peers = (List<RegexNode>)parent.Children!;
int currentIndex = peers.IndexOf(subsequent);
Debug.Assert(currentIndex >= 0, "Node should have been in its parent's child list");
if (currentIndex + 1 == peers.Count)
{
subsequent = parent;
continue;
}
else
{
subsequent = peers[currentIndex + 1];
break;
}
case null:
// If we hit the root, we're at the end of the expression, at which point nothing could backtrack
// in and we can declare success.
return true;
default:
// Anything else, we don't know what to do, so we have to assume it could conflict with the loop.
return false;
}
break;
}
}
}
/// <summary>Computes a min bound on the required length of any string that could possibly match.</summary>
/// <returns>The min computed length. If the result is 0, there is no minimum we can enforce.</returns>
/// <remarks>
/// e.g. abc[def](ghijkl|mn) => 6
/// </remarks>
public int ComputeMinLength()
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we can't recur further, assume there's no minimum we can enforce.
return 0;
}
switch (Kind)
{
case RegexNodeKind.One:
case RegexNodeKind.Notone:
case RegexNodeKind.Set:
// Single character.
return 1;
case RegexNodeKind.Multi:
// Every character in the string needs to match.
return Str!.Length;
case RegexNodeKind.Notonelazy:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Onelazy:
case RegexNodeKind.Oneloop:
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Setlazy:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
// One character repeated at least M times.
return M;
case RegexNodeKind.Lazyloop:
case RegexNodeKind.Loop:
// A node graph repeated at least M times.
return (int)Math.Min(int.MaxValue - 1, (long)M * Child(0).ComputeMinLength());
case RegexNodeKind.Alternate:
// The minimum required length for any of the alternation's branches.
{
int childCount = ChildCount();
Debug.Assert(childCount >= 2);
int min = Child(0).ComputeMinLength();
for (int i = 1; i < childCount && min > 0; i++)
{
min = Math.Min(min, Child(i).ComputeMinLength());
}
return min;
}
case RegexNodeKind.BackreferenceConditional:
// Minimum of its yes and no branches. The backreference doesn't add to the length.
return Math.Min(Child(0).ComputeMinLength(), Child(1).ComputeMinLength());
case RegexNodeKind.ExpressionConditional:
// Minimum of its yes and no branches. The condition is a zero-width assertion.
return Math.Min(Child(1).ComputeMinLength(), Child(2).ComputeMinLength());
case RegexNodeKind.Concatenate:
// The sum of all of the concatenation's children.
{
long sum = 0;
int childCount = ChildCount();
for (int i = 0; i < childCount; i++)
{
sum += Child(i).ComputeMinLength();
}
return (int)Math.Min(int.MaxValue - 1, sum);
}
case RegexNodeKind.Atomic:
case RegexNodeKind.Capture:
case RegexNodeKind.Group:
// For groups, we just delegate to the sole child.
Debug.Assert(ChildCount() == 1);
return Child(0).ComputeMinLength();
case RegexNodeKind.Empty:
case RegexNodeKind.Nothing:
case RegexNodeKind.UpdateBumpalong:
// Nothing to match. In the future, we could potentially use Nothing to say that the min length
// is infinite, but that would require a different structure, as that would only apply if the
// Nothing match is required in all cases (rather than, say, as one branch of an alternation).
case RegexNodeKind.Beginning:
case RegexNodeKind.Bol:
case RegexNodeKind.Boundary:
case RegexNodeKind.ECMABoundary:
case RegexNodeKind.End:
case RegexNodeKind.EndZ:
case RegexNodeKind.Eol:
case RegexNodeKind.NonBoundary:
case RegexNodeKind.NonECMABoundary:
case RegexNodeKind.Start:
case RegexNodeKind.NegativeLookaround:
case RegexNodeKind.PositiveLookaround:
// Zero-width
case RegexNodeKind.Backreference:
// Requires matching data available only at run-time. In the future, we could choose to find
// and follow the capture group this aligns with, while being careful not to end up in an
// infinite cycle.
return 0;
default:
Debug.Fail($"Unknown node: {Kind}");
goto case RegexNodeKind.Empty;
}
}
/// <summary>Computes a maximum length of any string that could possibly match.</summary>
/// <returns>The maximum length of any string that could possibly match, or null if the length may not always be the same.</returns>
/// <remarks>
/// e.g. abc[def](gh|ijklmnop) => 12
/// </remarks>
public int? ComputeMaxLength()
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
// If we can't recur further, assume there's no maximum we can enforce.
return null;
}
switch (Kind)
{
case RegexNodeKind.One:
case RegexNodeKind.Notone:
case RegexNodeKind.Set:
// Single character.
return 1;
case RegexNodeKind.Multi:
// Every character in the string needs to match.
return Str!.Length;
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or
RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or
RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic:
// Return the max number of iterations if there's an upper bound, or null if it's infinite
return N == int.MaxValue ? null : N;
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop:
if (N != int.MaxValue)
{
// A node graph repeated a fixed number of times
if (Child(0).ComputeMaxLength() is int childMaxLength)
{
long maxLength = (long)N * childMaxLength;
if (maxLength < int.MaxValue)
{
return (int)maxLength;
}
}
}
return null;
case RegexNodeKind.Alternate:
// The maximum length of any child branch, as long as they all have one.
{
int childCount = ChildCount();
Debug.Assert(childCount >= 2);
if (Child(0).ComputeMaxLength() is not int maxLength)
{
return null;
}
for (int i = 1; i < childCount; i++)
{
if (Child(i).ComputeMaxLength() is not int next)
{
return null;
}
maxLength = Math.Max(maxLength, next);
}
return maxLength;
}
case RegexNodeKind.BackreferenceConditional:
case RegexNodeKind.ExpressionConditional:
// The maximum length of either child branch, as long as they both have one.. The condition for an expression conditional is a zero-width assertion.
{
int i = Kind == RegexNodeKind.BackreferenceConditional ? 0 : 1;
return Child(i).ComputeMaxLength() is int yes && Child(i + 1).ComputeMaxLength() is int no ?
Math.Max(yes, no) :
null;
}
case RegexNodeKind.Concatenate:
// The sum of all of the concatenation's children's max lengths, as long as they all have one.
{
long sum = 0;
int childCount = ChildCount();
for (int i = 0; i < childCount; i++)
{
if (Child(i).ComputeMaxLength() is not int length)
{
return null;
}
sum += length;
}
if (sum < int.MaxValue)
{
return (int)sum;
}
return null;
}
case RegexNodeKind.Atomic:
case RegexNodeKind.Capture:
// For groups, we just delegate to the sole child.
Debug.Assert(ChildCount() == 1);
return Child(0).ComputeMaxLength();
case RegexNodeKind.Empty:
case RegexNodeKind.Nothing:
case RegexNodeKind.UpdateBumpalong:
case RegexNodeKind.Beginning:
case RegexNodeKind.Bol:
case RegexNodeKind.Boundary:
case RegexNodeKind.ECMABoundary:
case RegexNodeKind.End:
case RegexNodeKind.EndZ:
case RegexNodeKind.Eol:
case RegexNodeKind.NonBoundary:
case RegexNodeKind.NonECMABoundary:
case RegexNodeKind.Start:
case RegexNodeKind.PositiveLookaround:
case RegexNodeKind.NegativeLookaround:
// Zero-width
return 0;
case RegexNodeKind.Backreference:
// Requires matching data available only at run-time. In the future, we could choose to find
// and follow the capture group this aligns with, while being careful not to end up in an
// infinite cycle.
return null;
default:
Debug.Fail($"Unknown node: {Kind}");
goto case RegexNodeKind.Empty;
}
}
/// <summary>
/// Determines whether the specified child index of a concatenation begins a sequence whose values
/// should be used to perform an ordinal case-insensitive comparison.
/// </summary>
/// <param name="childIndex">The index of the child with which to start the sequence.</param>
/// <param name="exclusiveChildBound">The exclusive upper bound on the child index to iterate to.</param>
/// <param name="nodesConsumed">How many nodes make up the sequence, if any.</param>
/// <param name="caseInsensitiveString">The string to use for an ordinal case-insensitive comparison, if any.</param>
/// <param name="consumeZeroWidthNodes">
/// Defaults to false. When false, the consumer needs the semantics of matching the produced string to fully represent
/// the semantics of all the consumed nodes, which means nodes can be consumed iff they produce text that's represented
/// by the resulting string. When true, the resulting string needs to fully represent all valid matches at that position,
/// but it can have false positives, which means the resulting string doesn't need to fully represent all zero-width nodes
/// consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of
/// actual matching logic.
/// </param>
/// <returns>true if a sequence was found; otherwise, false.</returns>
public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false)
{
Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
var vsb = new ValueStringBuilder(stackalloc char[32]);
// We're looking in particular for sets of ASCII characters, so we focus only on sets with two characters in them, e.g. [Aa].
Span<char> twoChars = stackalloc char[2];
// Iterate from the child index to the exclusive upper bound.
int i = childIndex;
for (; i < exclusiveChildBound; i++)
{
RegexNode child = Child(i);
if (child.Kind is RegexNodeKind.One)
{
// We only want to include ASCII characters, and only if they don't participate in case conversion
// such that they only case to themselves and nothing other cases to them. Otherwise, including
// them would potentially cause us to match against things not allowed by the pattern.
if (child.Ch >= 128 ||
RegexCharClass.ParticipatesInCaseConversion(child.Ch))
{
break;
}
vsb.Append(child.Ch);
}
else if (child.Kind is RegexNodeKind.Multi)
{
// As with RegexNodeKind.One, the string needs to be composed solely of ASCII characters that
// don't participate in case conversion.
if (!RegexCharClass.IsAscii(child.Str.AsSpan()) ||
RegexCharClass.ParticipatesInCaseConversion(child.Str.AsSpan()))
{
break;
}
vsb.Append(child.Str);
}
else if (child.Kind is RegexNodeKind.Set ||
(child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N))
{
// In particular we want to look for sets that contain only the upper and lowercase variant
// of the same ASCII letter.
if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars))
{
break;
}
vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
}
else if (child.Kind is RegexNodeKind.Empty)
{
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
// but can still remain in some situations.
}
else if (consumeZeroWidthNodes &&
// anchors
child.Kind is RegexNodeKind.Beginning or
RegexNodeKind.Bol or
RegexNodeKind.Start or
// boundaries
RegexNodeKind.Boundary or
RegexNodeKind.ECMABoundary or
RegexNodeKind.NonBoundary or
RegexNodeKind.NonECMABoundary or
// lookarounds
RegexNodeKind.NegativeLookaround or
RegexNodeKind.PositiveLookaround or
// logic
RegexNodeKind.UpdateBumpalong)
{
// Skip over zero-width nodes that might be reasonable at the beginning of or within a substring.
// We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that
// may not fully represent the semantics of this portion of the pattern.
}
else
{
break;
}
}
// If we found at least two characters, consider it a sequence found. It's possible
// they all came from the same node, so this could be a sequence of just one node.
if (vsb.Length >= 2)
{
caseInsensitiveString = vsb.ToString();
nodesConsumed = i - childIndex;
return true;
}
// No sequence found.
caseInsensitiveString = null;
nodesConsumed = 0;
vsb.Dispose();
return false;
}
/// <summary>
/// Determine whether the specified child node is the beginning of a sequence that can
/// trivially have length checks combined in order to avoid bounds checks.
/// </summary>
/// <param name="childIndex">The starting index of the child to check.</param>
/// <param name="requiredLength">The sum of all the fixed lengths for the nodes in the sequence.</param>
/// <param name="exclusiveEnd">The index of the node just after the last one in the sequence.</param>
/// <returns>true if more than one node can have their length checks combined; otherwise, false.</returns>
/// <remarks>
/// There are additional node types for which we can prove a fixed length, e.g. examining all branches
/// of an alternation and returning true if all their lengths are equal. However, the primary purpose
/// of this method is to avoid bounds checks by consolidating length checks that guard accesses to
/// strings/spans for which the JIT can see a fixed index within bounds, and alternations employ
/// patterns that defeat that (e.g. reassigning the span in question). As such, the implementation
/// remains focused on only a core subset of nodes that are a) likely to be used in concatenations and
/// b) employ simple patterns of checks.
/// </remarks>
public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int requiredLength, out int exclusiveEnd)
{
Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
static bool CanJoinLengthCheck(RegexNode node) => node.Kind switch
{
RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set => true,
RegexNodeKind.Multi => true,
RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic or
RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloopatomic or
RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic
when node.M == node.N => true,
_ => false,
};
RegexNode child = Child(childIndex);
if (CanJoinLengthCheck(child))
{
requiredLength = child.ComputeMinLength();
int childCount = ChildCount();
for (exclusiveEnd = childIndex + 1; exclusiveEnd < childCount; exclusiveEnd++)
{
child = Child(exclusiveEnd);
if (!CanJoinLengthCheck(child))
{
break;
}
requiredLength += child.ComputeMinLength();
}
if (exclusiveEnd - childIndex > 1)
{
return true;
}
}
requiredLength = 0;
exclusiveEnd = 0;
return false;
}
public RegexNode MakeQuantifier(bool lazy, int min, int max)
{
// Certain cases of repeaters (min == max) can be handled specially
if (min == max)
{
switch (max)
{
case 0:
// The node is repeated 0 times, so it's actually empty.
return new RegexNode(RegexNodeKind.Empty, Options);
case 1:
// The node is repeated 1 time, so it's not actually a repeater.
return this;
case <= MultiVsRepeaterLimit when Kind == RegexNodeKind.One:
// The same character is repeated a fixed number of times, so it's actually a multi.
// While this could remain a repeater, multis are more readily optimized later in
// processing. The counts used here in real-world expressions are invariably small (e.g. 4),
// but we set an upper bound just to avoid creating really large strings.
Debug.Assert(max >= 2);
Kind = RegexNodeKind.Multi;
Str = new string(Ch, max);
Ch = '\0';
return this;
}
}
switch (Kind)
{
case RegexNodeKind.One:
case RegexNodeKind.Notone:
case RegexNodeKind.Set:
MakeRep(lazy ? RegexNodeKind.Onelazy : RegexNodeKind.Oneloop, min, max);
return this;
default:
var result = new RegexNode(lazy ? RegexNodeKind.Lazyloop : RegexNodeKind.Loop, Options, min, max);
result.AddChild(this);
return result;
}
}
public void AddChild(RegexNode newChild)
{
newChild.Parent = this; // so that the child can see its parent while being reduced
newChild = newChild.Reduce();
newChild.Parent = this; // in case Reduce returns a different node that needs to be reparented
if (Children is null)
{
Children = newChild;
}
else if (Children is RegexNode currentChild)
{
Children = new List<RegexNode>() { currentChild, newChild };
}
else
{
((List<RegexNode>)Children).Add(newChild);
}
}
public void InsertChild(int index, RegexNode newChild)
{
Debug.Assert(Children is List<RegexNode>);
newChild.Parent = this; // so that the child can see its parent while being reduced
newChild = newChild.Reduce();
newChild.Parent = this; // in case Reduce returns a different node that needs to be reparented
((List<RegexNode>)Children).Insert(index, newChild);
}
public void ReplaceChild(int index, RegexNode newChild)
{
Debug.Assert(Children != null);
Debug.Assert(index < ChildCount());
newChild.Parent = this; // so that the child can see its parent while being reduced
newChild = newChild.Reduce();
newChild.Parent = this; // in case Reduce returns a different node that needs to be reparented
if (Children is RegexNode)
{
Children = newChild;
}
else
{
((List<RegexNode>)Children)[index] = newChild;
}
}
public RegexNode Child(int i) => Children is RegexNode child ? child : ((List<RegexNode>)Children!)[i];
public int ChildCount()
{
if (Children is null)
{
return 0;
}
if (Children is List<RegexNode> children)
{
return children.Count;
}
Debug.Assert(Children is RegexNode);
return 1;
}
// Determines whether the node supports a compilation strategy based on walking the node tree.
// Also returns a human-readable string to explain the reason (it will be emitted by the source generator, hence
// there's no need to localize).
internal bool SupportsCompilation([NotNullWhen(false)] out string? reason)
{
if ((Options & RegexOptions.NonBacktracking) != 0)
{
reason = "RegexOptions.NonBacktracking isn't supported";
return false;
}
if (ExceedsMaxDepthAllowedDepth(this, allowedDepth: 40))
{
// For the source generator, deep RegexNode trees can result in emitting C# code that exceeds C# compiler
// limitations, leading to "CS8078: An expression is too long or complex to compile". As such, we place
// an artificial limit on max tree depth in order to mitigate such issues. The allowed depth can be tweaked
// as needed; its exceedingly rare to find expressions with such deep trees. And while RegexCompiler doesn't
// have to deal with C# compiler limitations, we still want to limit max tree depth as we want to limit
// how deep recursion we'll employ as part of code generation.
reason = "the expression may result exceeding run-time or compiler limits";
return false;
}
// Supported.
reason = null;
return true;
static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth)
{
if (allowedDepth <= 0)
{
return true;
}
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
if (ExceedsMaxDepthAllowedDepth(node.Child(i), allowedDepth - 1))
{
return true;
}
}
return false;
}
}
/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
[MemberNotNullWhen(true, nameof(Str))]
public bool IsSetFamily => Kind is RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy;
/// <summary>Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node.</summary>
public bool IsOneFamily => Kind is RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy;
/// <summary>Gets whether the node is a Notone/Notoneloop/Notoneloopatomic/Notonelazy node.</summary>
public bool IsNotoneFamily => Kind is RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy;
#if DEBUG
[ExcludeFromCodeCoverage] // Used only for debugging assistance
public override string ToString()
{
RegexNode? curNode = this;
int curChild = 0;
var sb = new StringBuilder().AppendLine(curNode.Describe());
var stack = new List<int>();
while (true)
{
if (curChild < curNode!.ChildCount())
{
stack.Add(curChild + 1);
curNode = curNode.Child(curChild);
curChild = 0;
sb.Append(new string(' ', stack.Count * 2)).Append(curNode.Describe()).AppendLine();
}
else
{
if (stack.Count == 0)
{
break;
}
curChild = stack[stack.Count - 1];
stack.RemoveAt(stack.Count - 1);
curNode = curNode.Parent;
}
}
return sb.ToString();
}
[ExcludeFromCodeCoverage] // Used only for debugging assistance
private string Describe()
{
var sb = new StringBuilder(Kind.ToString());
if ((Options & RegexOptions.ExplicitCapture) != 0) sb.Append("-C");
if ((Options & RegexOptions.IgnoreCase) != 0) sb.Append("-I");
if ((Options & RegexOptions.RightToLeft) != 0) sb.Append("-L");
if ((Options & RegexOptions.Multiline) != 0) sb.Append("-M");
if ((Options & RegexOptions.Singleline) != 0) sb.Append("-S");
if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) sb.Append("-X");
if ((Options & RegexOptions.ECMAScript) != 0) sb.Append("-E");
switch (Kind)
{
case RegexNodeKind.Oneloop:
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Onelazy:
case RegexNodeKind.Notonelazy:
case RegexNodeKind.One:
case RegexNodeKind.Notone:
sb.Append(" '").Append(RegexCharClass.DescribeChar(Ch)).Append('\'');
break;
case RegexNodeKind.Capture:
sb.Append(' ').Append($"index = {M}");
if (N != -1)
{
sb.Append($", unindex = {N}");
}
break;
case RegexNodeKind.Backreference:
case RegexNodeKind.BackreferenceConditional:
sb.Append(' ').Append($"index = {M}");
break;
case RegexNodeKind.Multi:
sb.Append(" \"");
foreach (char c in Str!)
{
sb.Append(RegexCharClass.DescribeChar(c));
}
sb.Append('"');
break;
case RegexNodeKind.Set:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
case RegexNodeKind.Setlazy:
sb.Append(' ').Append(RegexCharClass.DescribeSet(Str!));
break;
}
switch (Kind)
{
case RegexNodeKind.Oneloop:
case RegexNodeKind.Oneloopatomic:
case RegexNodeKind.Notoneloop:
case RegexNodeKind.Notoneloopatomic:
case RegexNodeKind.Onelazy:
case RegexNodeKind.Notonelazy:
case RegexNodeKind.Setloop:
case RegexNodeKind.Setloopatomic:
case RegexNodeKind.Setlazy:
case RegexNodeKind.Loop:
case RegexNodeKind.Lazyloop:
sb.Append(
(M == 0 && N == int.MaxValue) ? "*" :
(M == 0 && N == 1) ? "?" :
(M == 1 && N == int.MaxValue) ? "+" :
(N == int.MaxValue) ? $"{{{M}, *}}" :
(N == M) ? $"{{{M}}}" :
$"{{{M}, {N}}}");
break;
}
return sb.ToString();
}
#endif
}
}
|