|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Represents a regex matching engine that performs regex matching using symbolic derivatives.</summary>
internal abstract class SymbolicRegexMatcher
{
#if DEBUG
/// <inheritdoc cref="Regex.SaveDGML(TextWriter, int)"/>
public abstract void SaveDGML(TextWriter writer, int maxLabelLength);
/// <inheritdoc cref="Regex.SampleMatches(int, int)"/>
public abstract IEnumerable<string> SampleMatches(int k, int randomseed);
/// <inheritdoc cref="Regex.Explore(bool, bool, bool, bool, bool)"/>
public abstract void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa);
#endif
}
/// <summary>Represents a regex matching engine that performs regex matching using symbolic derivatives.</summary>
/// <typeparam name="TSet">Character set type.</typeparam>
internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher where TSet : IComparable<TSet>, IEquatable<TSet>
{
/// <summary>Sentinel value used internally by the matcher to indicate no match exists.</summary>
private const int NoMatchExists = -2;
/// <summary>Builder used to create <see cref="SymbolicRegexNode{S}"/>s while matching.</summary>
/// <remarks>
/// The builder is used to build up the DFA state space lazily, which means we need to be able to
/// produce new <see cref="SymbolicRegexNode{S}"/>s as we match. Once in NFA mode, we also use
/// the builder to produce new NFA states. The builder maintains a cache of all DFA and NFA states.
/// </remarks>
internal readonly SymbolicRegexBuilder<TSet> _builder;
/// <summary>Maps every character to its corresponding minterm ID.</summary>
private readonly MintermClassifier _mintermClassifier;
/// <summary><see cref="_pattern"/> prefixed with [0-0xFFFF]*</summary>
/// <remarks>
/// The matching engine first uses <see cref="_dotStarredPattern"/> to find whether there is a match
/// and where that match might end. Prepending the .* prefix onto the original pattern provides the DFA
/// with the ability to continue to process input characters even if those characters aren't part of
/// the match. If Regex.IsMatch is used, nothing further is needed beyond this prefixed pattern. If, however,
/// other matching operations are performed that require knowing the exact start and end of the match,
/// the engine then needs to process the pattern in reverse to find where the match actually started;
/// for that, it uses the <see cref="_reversePattern"/> and walks backwards through the input characters
/// from where <see cref="_dotStarredPattern"/> left off. At this point we know that there was a match,
/// where it started, and where it could have ended, but that ending point could be influenced by the
/// selection of the starting point. So, to find the actual ending point, the original <see cref="_pattern"/>
/// is then used from that starting point to walk forward through the input characters again to find the
/// actual end point used for the match.
/// </remarks>
internal readonly SymbolicRegexNode<TSet> _dotStarredPattern;
/// <summary>The original regex pattern.</summary>
internal readonly SymbolicRegexNode<TSet> _pattern;
/// <summary>The reverse of <see cref="_pattern"/>.</summary>
/// <remarks>
/// Determining that there is a match and where the match ends requires only <see cref="_pattern"/>.
/// But from there determining where the match began requires reversing the pattern and running
/// the matcher again, starting from the ending position. This <see cref="_reversePattern"/> caches
/// that reversed pattern used for extracting match start.
/// </remarks>
internal readonly SymbolicRegexNode<TSet> _reversePattern;
/// <summary>true iff timeout checking is enabled.</summary>
private readonly bool _checkTimeout;
/// <summary>Timeout in milliseconds. This is only used if <see cref="_checkTimeout"/> is true.</summary>
private readonly int _timeout;
/// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
private readonly RegexFindOptimizations? _findOpts;
/// <summary>
/// Dead end state to quickly return NoMatch.
/// </summary>
private readonly int _deadStateId;
/// <summary>Initial state used for vectorization.</summary>
private readonly int _initialStateId;
/// <summary>Whether the pattern contains any anchor.</summary>
private readonly bool _containsAnyAnchor;
/// <summary>Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts.</summary>
private readonly bool _containsEndZAnchor;
/// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _initialStates;
/// <summary>The initial states for the dot-star pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _dotstarredInitialStates;
/// <summary>The initial states for the reverse pattern, keyed off of the previous character kind.</summary>
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _reverseInitialStates;
/// <summary>Details on optimized processing of the reverse of the pattern to find the beginning of a match.</summary>
private readonly MatchReversalInfo<TSet> _optimizedReversalInfo;
/// <summary>Partition of the input space of sets.</summary>
private readonly TSet[] _minterms;
/// <summary>
/// Character kinds <see cref="CharKind"/> for all minterms in <see cref="_minterms"/> as well as two special
/// cases: character positions outside the input bounds and an end-of-line as the last input character.
/// </summary>
private readonly uint[] _positionKinds;
/// <summary>
/// The smallest k s.t. 2^k >= minterms.Length + 1. The "delta arrays", e.g., <see cref="_dfaDelta"/> allocate 2^k
/// consecutive slots for each state ID to represent the transitions for each minterm. The extra slot at index
/// _minterms.Length is used to represent an \n occurring at the very end of input, for supporting the \Z anchor.
/// </summary>
private readonly int _mintermsLog;
/// <summary>Number of capture groups.</summary>
private readonly int _capsize;
/// <summary>Gets whether the regular expression contains captures (beyond the implicit root-level capture).</summary>
/// <remarks>This determines whether the matcher uses the special capturing NFA simulation mode.</remarks>
internal bool HasSubcaptures => _capsize > 1;
/// <remarks>
/// Both solvers supported here, <see cref="UInt64Solver"/> and <see cref="BitVectorSolver"/> are thread safe.
/// </remarks>
private ISolver<TSet> Solver => _builder._solver;
/// <summary>Creates a new <see cref="SymbolicRegexMatcher{TSetType}"/>.</summary>
/// <param name="captureCount">The number of captures in the regular expression.</param>
/// <param name="findOptimizations">The find optimizations computed from the expression.</param>
/// <param name="bddBuilder">The <see cref="BDD"/>-based builder.</param>
/// <param name="rootBddNode">The root <see cref="BDD"/>-based node from the pattern.</param>
/// <param name="solver">The solver to use.</param>
/// <param name="matchTimeout">The match timeout to use.</param>
public static SymbolicRegexMatcher<TSet> Create(
int captureCount, RegexFindOptimizations findOptimizations,
SymbolicRegexBuilder<BDD> bddBuilder, SymbolicRegexNode<BDD> rootBddNode, ISolver<TSet> solver,
TimeSpan matchTimeout)
{
CharSetSolver charSetSolver = (CharSetSolver)bddBuilder._solver;
var builder = new SymbolicRegexBuilder<TSet>(solver, charSetSolver)
{
// The default constructor sets the following sets to empty; they're lazily-initialized when needed.
// Only if anchors are in the regex will these be set to non-empty.
_wordLetterForBoundariesSet = solver.ConvertFromBDD(bddBuilder._wordLetterForBoundariesSet, charSetSolver),
_newLineSet = solver.ConvertFromBDD(bddBuilder._newLineSet, charSetSolver)
};
// Convert the BDD-based AST to TSet-based AST
SymbolicRegexNode<TSet> rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver));
return new SymbolicRegexMatcher<TSet>(builder, rootNode, captureCount, findOptimizations, matchTimeout);
}
/// <summary>Constructs matcher for given symbolic regex.</summary>
private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
{
Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}");
_pattern = rootNode;
_builder = builder;
_checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;
_timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms
TSet[]? solverMinterms = builder._solver.GetMinterms();
Debug.Assert(solverMinterms is not null);
_minterms = solverMinterms;
// BitOperations.Log2 gives the integer floor of the log, so the +1 below either rounds up with non-power-of-two
// minterms or adds an extra bit with power-of-two minterms. The extra slot at index _minterms.Length is used to
// represent an \n occurring at the very end of input, for supporting the \Z anchor.
_mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1;
_mintermClassifier = builder._solver is UInt64Solver bv64 ?
bv64._classifier :
((BitVectorSolver)(object)builder._solver)._classifier;
_capsize = captureCount;
// Initialize state and nullability arrays.
_stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
_stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
_nullabilityArray = new byte[InitialDfaStateCapacity];
_dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
// Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm
// ID _minterms.Length, which is used to represent a \n at the very end of input, and another ID -1,
// which is used to represent any position outside the bounds of the input.
_positionKinds = new uint[_minterms.Length + 2];
for (int mintermId = -1; mintermId < _positionKinds.Length - 1; mintermId++)
{
_positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId);
}
// Gather optimized reversal processing information.
_optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder));
// Store the find optimizations that can be used to jump ahead to the next possible starting location.
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's
// handling for beginning anchors.
if (findOptimizations.IsUseful &&
findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
{
_findOpts = findOptimizations;
}
// Determine the number of initial states. If there's no anchor, only the default previous
// character kind 0 is ever going to be used for all initial states.
int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;
// The loops below and how character kinds are calculated assume that the "general" character kind is zero
Debug.Assert(CharKind.General == 0);
// Assign edge case info for quick lookup
_containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
_containsEndZAnchor = _pattern._info.ContainsEndZAnchor;
// Create the initial states for the original pattern.
var initialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < initialStates.Length; charKind++)
{
initialStates[charKind] = GetOrCreateState_NoLock(_pattern, charKind);
}
_initialStates = initialStates;
// Create the dot-star pattern (a concatenation of any* with the original pattern)
// and all of its initial states.
_dotStarredPattern = builder.CreateConcat(builder._anyStarLazy, _pattern);
var dotstarredInitialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < dotstarredInitialStates.Length; charKind++)
{
// Used to detect if initial state was reentered,
// but observe that the behavior from the state may ultimately depend on the previous
// input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor,
// in that sense there can be several "versions" (not more than StateCount) of the initial state.
dotstarredInitialStates[charKind] = GetOrCreateState_NoLock(_dotStarredPattern, charKind, isInitialState: true);
}
_dotstarredInitialStates = dotstarredInitialStates;
// Assign dead and initial state ids
_deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
_initialStateId = _dotstarredInitialStates[CharKind.General].Id;
// Create the reverse pattern (the original pattern in reverse order) and all of its
// initial states. Also disable backtracking simulation to ensure the reverse path from
// the final state that was found is followed. Not doing so might cause the earliest
// starting point to not be found.
_reversePattern = builder.CreateDisableBacktrackingSimulation(_pattern.Reverse(builder));
var reverseInitialStates = new MatchingState<TSet>[statesCount];
for (uint charKind = 0; charKind < reverseInitialStates.Length; charKind++)
{
reverseInitialStates[charKind] = GetOrCreateState_NoLock(_reversePattern, charKind);
}
_reverseInitialStates = reverseInitialStates;
// Maps a minterm ID to a character kind
uint CalculateMintermIdKind(int mintermId)
{
// Only patterns with anchors use anything except the general kind
if (_pattern._info.ContainsSomeAnchor)
{
// A minterm ID of -1 represents the positions before the first and after the last character
// in the input.
if (mintermId == -1)
{
return CharKind.BeginningEnd;
}
// A minterm ID of minterms.Length represents a \n at the very end of input, which is matched
// by the \Z anchor.
if ((uint)mintermId == (uint)_minterms.Length)
{
return CharKind.NewLineS;
}
TSet minterm = _minterms[mintermId];
// Examine the minterm to figure out its character kind
if (_builder._newLineSet.Equals(minterm))
{
// The minterm is a new line character
return CharKind.Newline;
}
else if (!Solver.IsEmpty(Solver.And(_builder._wordLetterForBoundariesSet, minterm)))
{
Debug.Assert(Solver.IsEmpty(Solver.And(Solver.Not(_builder._wordLetterForBoundariesSet), minterm)));
// The minterm is a subset of word letters as considered by \b and \B
return CharKind.WordLetter;
}
}
// All other minterms belong to the general kind
return CharKind.General;
}
}
/// <summary>
/// Create a PerThreadData with the appropriate parts initialized for this matcher's pattern.
/// </summary>
internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
/// <summary>Look up what is the character kind given a position ID</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
/// <summary>
/// Lookup the actual minterm based on its ID. Also get its character kind, which is a general categorization of
/// characters used for cheaply deciding the nullability of anchors.
/// </summary>
internal TSet GetMintermFromId(int mintermId)
{
TSet[] minterms = _minterms;
if ((uint)mintermId < (uint)minterms.Length)
{
return minterms[mintermId];
}
// A minterm ID of minterms.Length represents a \n at the very end of input, which is matched by the \Z anchor.
return _builder._newLineSet;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetCharKind(ReadOnlySpan<char> input, int i) =>
!_pattern._info.ContainsSomeAnchor ?
CharKind.General : // The previous character kind is irrelevant when anchors are not used.
GetPositionKind(DefaultInputReader.GetPositionId(this, input, i));
private void CheckTimeout(long timeoutOccursAt)
{
Debug.Assert(_checkTimeout);
if (Environment.TickCount64 >= timeoutOccursAt)
{
ThrowRegexTimeout();
}
void ThrowRegexTimeout() => throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout));
}
/// <summary>Find a match.</summary>
/// <param name="mode">The mode of execution based on the regex operation being performed.</param>
/// <param name="input">The input span</param>
/// <param name="startat">The position to start search in the input span.</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, int startat, PerThreadData perThreadData)
{
Debug.Assert(startat >= 0 && startat <= input.Length, $"{nameof(startat)} == {startat}, {nameof(input.Length)} == {input.Length}");
Debug.Assert(perThreadData is not null);
// If we need to perform timeout checks, store the absolute timeout value.
long timeoutOccursAt = 0;
if (_checkTimeout)
{
// Using Environment.TickCount for efficiency instead of Stopwatch -- as in the non-DFA case.
timeoutOccursAt = Environment.TickCount64 + _timeout;
}
// Phase 1:
// Determine the end point of the match. The returned index is one-past-the-end index for the characters
// in the match. Note that -1 is a valid end point for an empty match at the beginning of the input.
// It returns NoMatchExists (-2) when there is no match.
// As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
// the position of the last b: aacaaaabbbc. It additionally records the position of the first a after
// the c as the low boundary for the starting position.
// The Z anchor and over 255 minterms are rare enough to consider them separate edge cases.
int matchEnd;
if (!_containsEndZAnchor && _mintermClassifier.ByteLookup is not null)
{
// Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments.
matchEnd = (_findOpts is not null, _containsAnyAnchor) switch
{
(true, true) => FindEndPositionOptimized<NoZAnchorFindOptimizationsInitialStateHandler, DefaultDfaNoZAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
(true, false) => FindEndPositionOptimized<NoAnchorsFindOptimizationsInitialStateHandler, NoAnchorDfaOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
(false, true) => FindEndPositionOptimized<NoOptimizationsInitialStateHandler, DefaultDfaNoZAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
(false, false) => FindEndPositionOptimized<NoOptimizationsInitialStateHandler, NoAnchorDfaOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
};
}
else
{
// Fallback for Z anchor or over 255 minterms
matchEnd = _findOpts is not null ?
FindEndPositionFallback<FindOptimizationsInitialStateHandler, DefaultNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData) :
FindEndPositionFallback<NoOptimizationsInitialStateHandler, DefaultNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData);
}
// If there wasn't a match, we're done.
if (matchEnd == NoMatchExists)
{
return SymbolicMatch.NoMatch;
}
// A match exists. If we don't need further details, e.g. because IsMatch was used (and thus we don't
// need the exact bounds of the match, captures, etc.), we're done.
if (mode == RegexRunnerMode.ExistenceRequired)
{
return SymbolicMatch.MatchExists;
}
// Phase 2:
// Match backwards through the input matching against the reverse of the pattern, looking for the earliest
// start position. That tells us the actual starting position of the match. We can skip this phase if we
// recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that
// exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from
// that last b until it finds the 4th a: aaabbbc.
int matchStart = 0;
Debug.Assert(matchEnd >= startat - 1);
switch (_optimizedReversalInfo.Kind)
{
case MatchReversalKind.MatchStart:
case MatchReversalKind.PartialFixedLength:
int initialLastStart = -1; // invalid sentinel value
int i = matchEnd;
CurrentState reversalStartState;
if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart)
{
// No fixed-length knowledge. Start at the end of the match.
reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]);
}
else
{
// There's a fixed-length portion at the end of the match. Start just before it.
i -= _optimizedReversalInfo.FixedLength;
reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!);
// reversal may already be nullable here in the case of anchors
if (_containsAnyAnchor &&
_nullabilityArray[reversalStartState.DfaStateId] > 0 &&
DefaultNullabilityHandler.IsNullableAt<DfaStateHandler>(this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i)))
{
initialLastStart = i;
}
}
matchStart = matchEnd < startat ? startat : (_containsEndZAnchor, _containsAnyAnchor) switch
{
// Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization;
// the (true, true) case is functionally complete whereas the (false, false) case is the most optimized.
(true, true) => FindStartPosition<DefaultInputReader, DefaultNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
(true, false) => FindStartPosition<DefaultInputReader, NoAnchorsNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
(false, true) => FindStartPosition<NoZAnchorOptimizedInputReader, DefaultNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
(false, false) => FindStartPosition<NoZAnchorOptimizedInputReader, NoAnchorsNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
};
break;
case MatchReversalKind.FixedLength:
// The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there.
matchStart = matchEnd - _optimizedReversalInfo.FixedLength;
break;
default:
Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}");
break;
}
// Phase 3:
// If there are no subcaptures (or if they're not needed), the matching process is done. For patterns with subcaptures
// (captures other than the top-level capture for the whole match), we need to do an additional pass to find their bounds.
// Continuing for the previous example, phase 3 will be executed for the characters inside the match, aaabbbc,
// and will find associate the one capture (b*) with it's match: bbb.
if (!HasSubcaptures || mode < RegexRunnerMode.FullMatchRequired)
{
return new SymbolicMatch(matchStart, matchEnd - matchStart);
}
else
{
Registers endRegisters = _containsAnyAnchor ?
FindSubcaptures<DefaultInputReader>(input, matchStart, matchEnd, perThreadData) :
FindSubcaptures<NoZAnchorOptimizedInputReader>(input, matchStart, matchEnd, perThreadData);
return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
}
}
/// <summary>
/// Streamlined version of <see cref="FindEndPositionFallback"/> that doesn't handle /z anchors or very large sets of minterms.
/// </summary>
private int FindEndPositionOptimized<TInitialStateHandler, TOptimizedNullabilityHandler>(
ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
where TInitialStateHandler : struct, IInitialStateHandler
where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler
{
// Initial state candidate.
var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]);
int endPos = NoMatchExists;
int lengthMinus1 = input.Length - 1;
while (true)
{
int innerLoopLength;
bool done;
if (currentState.NfaState is null)
{
const int DfaCharsPerTimeoutCheck = 100_000;
innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1;
done = FindEndPositionDeltasDFAOptimized<TInitialStateHandler, TOptimizedNullabilityHandler>(
input, innerLoopLength, mode, timeoutOccursAt, ref pos,
ref currentState.DfaStateId, ref endPos);
}
else
{
// NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing.
const int NfaCharsPerTimeoutCheck = 1_000;
innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
done = FindEndPositionDeltasNFA<DefaultNullabilityHandler>(
input, innerLoopLength, mode, ref pos,
ref currentState, ref endPos);
}
// If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
// there is no more input available, then the whole search is done.
if (done || pos >= input.Length)
{
break;
}
// The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and
// need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more
// input available.
if (pos < innerLoopLength)
{
// Because there was still more input available, a failure to transition in DFA mode must be the cause
// of the early exit. Upgrade to NFA mode.
NfaMatchingState nfaState = perThreadData.NfaState;
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
currentState = new CurrentState(nfaState);
}
// Check for a timeout before continuing.
if (_checkTimeout)
{
CheckTimeout(timeoutOccursAt);
}
}
return endPos;
}
/// <summary>Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call.</summary>
/// <param name="input">The input text.</param>
/// <param name="pos">The starting position in <paramref name="input"/>.</param>
/// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
/// <param name="mode">The mode of execution based on the regex operation being performed.</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
/// <returns>
/// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
/// </returns>
private int FindEndPositionFallback<TInitialStateHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
where TInitialStateHandler : struct, IInitialStateHandler
where TNullabilityHandler : struct, INullabilityHandler
{
var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]);
int endPos = NoMatchExists;
while (true)
{
// Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked,
// we need to pop out of the inner loop every now and then to do the timeout check in this outer loop. Note that
// the timeout exists not to provide perfect guarantees around execution time but rather as a mitigation against
// catastrophic backtracking. Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
// still check the timeout now and again to provide some semblance of the behavior a developer experiences with
// the backtracking engines. We can, however, choose a large number here, since it's not actually needed for security.
// The fallback function has lower limits due to worse performance from edge cases
int innerLoopLength;
bool done;
if (currentState.NfaState is null)
{
const int DfaCharsPerTimeoutCheck = 25_000;
innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length;
done = FindEndPositionDeltasDFA<TInitialStateHandler, TNullabilityHandler>(
input, innerLoopLength, mode, ref pos, ref currentState, ref endPos);
}
else
{
// NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here.
const int NfaCharsPerTimeoutCheck = 1_000;
innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
done = FindEndPositionDeltasNFA<TNullabilityHandler>(
input, innerLoopLength, mode, ref pos, ref currentState, ref endPos);
}
// If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
// there is no more input available, then the whole search is done.
if (done || pos >= input.Length)
{
break;
}
// The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and
// need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more
// input available.
if (pos < innerLoopLength)
{
// Because there was still more input available, a failure to transition in DFA mode must be the cause
// of the early exit. Upgrade to NFA mode.
NfaMatchingState nfaState = perThreadData.NfaState;
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
currentState = new CurrentState(nfaState);
}
// Check for a timeout before continuing.
if (_checkTimeout)
{
CheckTimeout(timeoutOccursAt);
}
}
return endPos;
}
/// <summary>
/// This version of <see cref="FindEndPositionDeltasDFA"/> uses a different set of interfaces,
/// which don't check for many inner loop edge cases, e.g. input end or '\n'.
/// All edge cases are handled before entering the loop.
/// </summary>
private bool FindEndPositionDeltasDFAOptimized<TInitialStateHandler, TOptimizedNullabilityHandler>(
ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef)
where TInitialStateHandler : struct, IInitialStateHandler
where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler
{
int pos = posRef;
// Initial check for input end lifted out of the subsequent hot-path loop.
if (pos == input.Length)
{
if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0]))
{
// The end position kind was nullable.
endPosRef = pos;
}
return true;
}
// To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
int currStateId = currentStateIdRef;
int endPos = endPosRef;
byte[] mintermsLookup = _mintermClassifier.ByteLookup!;
int deadStateId = _deadStateId;
int initialStateId = _initialStateId;
bool result = true;
while (currStateId != deadStateId)
{
if (TInitialStateHandler.IsOptimized && currStateId == initialStateId)
{
TInitialStateHandler.TryFindNextStartingPosition(this, input, ref currStateId, ref pos, mintermsLookup);
if (pos == input.Length)
{
// Patterns such as ^$ can be nullable right away.
if (_stateArray[currStateId]!.IsNullableFor(_positionKinds[0]))
{
// The end position kind was nullable.
endPos = pos;
}
currStateId = deadStateId;
break;
}
}
// Get the next character.
char c = input[pos];
// If the state is nullable for the next character, we found a potential end state.
if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray[currStateId], c, mintermsLookup))
{
endPos = pos;
if (mode == RegexRunnerMode.ExistenceRequired)
{
// A match is known to exist. If that's all we need to know, we're done.
break;
}
}
// If there is more input available try to transition with the next character.
// Note: the order here is important so the transition itself gets taken
if (!DfaStateHandler.TryTakeTransition(this, ref currStateId, GetMintermId(mintermsLookup, c), timeoutOccursAt) ||
pos >= lengthMinus1)
{
if (pos + 1 < input.Length)
{
result = false;
break;
}
pos++;
// One off check for the final position. This is just to move it out of the hot loop.
if (_stateFlagsArray[currStateId].IsNullable() ||
_stateArray[currStateId]!.IsNullableFor(_positionKinds[0]))
{
// The end position (-1) was nullable.
endPos = pos;
}
break;
}
// We successfully transitioned, so update our current input index to match.
pos++;
}
// Write back the local copies of the ref values.
posRef = pos;
endPosRef = endPos;
currentStateIdRef = currStateId;
return result;
}
/// <summary>
/// Workhorse inner loop for <see cref="FindEndPositionFallback{TFindOptimizationsHandler,TNullabilityHandler}"/>. Consumes the <paramref name="input"/> character by character,
/// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
/// </summary>
/// <returns>
/// true if all input has been explored and there's no further work to be done; false if there's more input to explore and/or
/// we need to transition from DFA mode to NFA mode.
/// </returns>
private bool FindEndPositionDeltasDFA<TInitialStateHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
ref int posRef, ref CurrentState stateRef, ref int endPosRef)
where TInitialStateHandler : struct, IInitialStateHandler
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
int pos = posRef;
int endPos = endPosRef;
CurrentState state = stateRef;
int deadStateId = _deadStateId;
int initialStateId = _initialStateId;
// Loop through each character in the input, transitioning from state to state for each.
bool result = true;
while (true)
{
if (state.DfaStateId == deadStateId ||
(state.DfaStateId == initialStateId && !TInitialStateHandler.TryFindNextStartingPosition(this, input, ref state.DfaStateId, ref pos, null!)))
{
break;
}
int positionId = DefaultInputReader.GetPositionId(this, input, pos);
// If the state is nullable for the next character, meaning it accepts the empty string,
// we found a potential end state.
if (TNullabilityHandler.IsNullableAt<DfaStateHandler>(this, in state, positionId))
{
endPos = pos;
if (mode == RegexRunnerMode.ExistenceRequired)
{
// A match is known to exist. If that's all we need to know, we're done.
break;
}
}
// If there is more input available try to transition with the next character.
if (pos >= length || !DfaStateHandler.TryTakeTransition(this, ref state.DfaStateId, positionId))
{
result = false;
break;
}
// We successfully transitioned, so update our current input index to match.
pos++;
}
// Write back the local copies of the ref values.
posRef = pos;
endPosRef = endPos;
stateRef = state;
return result;
}
/// <summary>
/// Workhorse inner loop for <see cref="FindEndPositionFallback{TFindOptimizationsHandler,TNullabilityHandler}"/>. Consumes the <paramref name="input"/> character by character,
/// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
/// </summary>
/// <returns>
/// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
/// 0 if iteration completed because we reached an initial state.
/// A negative value if iteration completed because we ran out of input or we failed to transition.
/// </returns>
private bool FindEndPositionDeltasNFA<TNullabilityHandler>(
ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
ref int posRef, ref CurrentState state, ref int endPosRef)
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
int pos = posRef;
int endPos = endPosRef;
// Loop through each character in the input, transitioning from state to state for each.
bool result = true;
while (state.NfaState!.NfaStateSet.Count != 0) // Dead end here means the set is empty
{
int positionId = DefaultInputReader.GetPositionId(this, input, pos);
// If the state is nullable for the next character, meaning it accepts the empty string,
// we found a potential end state.
if (TNullabilityHandler.IsNullableAt<NfaStateHandler>(this, in state, positionId))
{
endPos = pos;
if (mode == RegexRunnerMode.ExistenceRequired)
{
// A match is known to exist. If that's all we need to know, we're done.
break;
}
}
// If there is more input available try to transition with the next character.
if (pos >= length || !NfaStateHandler.TryTakeTransition(this, ref state, positionId))
{
break;
}
// We successfully transitioned, so update our current input index to match.
pos++;
}
// Write back the local copies of the ref values.
posRef = pos;
endPosRef = endPos;
return result;
}
/// <summary>
/// Phase 2 of matching. From a found ending position, walk in reverse through the input using the reverse pattern to find the
/// start position of match.
/// </summary>
/// <remarks>
/// The start position is known to exist; this function just needs to determine exactly what it is.
/// We need to find the earliest (lowest index) starting position that's not earlier than <paramref name="matchStartBoundary"/>.
/// </remarks>
/// <param name="startState">State to start reversal from</param>
/// <param name="initialLastStart">Either valid match start location or -1</param>
/// <param name="input">The input text.</param>
/// <param name="i">The ending position to walk backwards from. <paramref name="i"/> points one past the last character of the match.</param>
/// <param name="matchStartBoundary">The initial starting location discovered in phase 1, a point we must not walk earlier than.</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
/// <returns>The found starting position for the match.</returns>
private int FindStartPosition<TInputReader, TNullabilityHandler>(CurrentState startState, int initialLastStart, ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
Debug.Assert(i >= 0, $"{nameof(i)} == {i}");
Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}");
Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}.");
CurrentState currentState = startState;
int lastStart = initialLastStart;
// Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
while (true)
{
// Run the DFA or NFA traversal backwards from the current point using the current state.
bool done = currentState.NfaState is not null ?
FindStartPositionDeltasNFA<TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
FindStartPositionDeltasDFA<TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
// If we found the starting position, we're done.
if (done)
{
break;
}
// We didn't find the starting position but we did exit out of the backwards traversal. That should only happen
// if we were unable to transition, which should only happen if we were in DFA mode and exceeded our graph size.
// Upgrade to NFA mode and continue.
Debug.Assert(i >= matchStartBoundary);
NfaMatchingState nfaState = perThreadData.NfaState;
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
currentState = new CurrentState(nfaState);
}
Debug.Assert(lastStart != -1, "We expected to find a starting position but didn't.");
return lastStart;
}
/// <summary>
/// Workhorse inner loop for <see cref="FindStartPosition"/>. Consumes the <paramref name="input"/> character by character in reverse,
/// starting at <paramref name="i"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
/// </summary>
private bool FindStartPositionDeltasDFA<TInputReader, TNullabilityHandler>(
ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState stateRef, ref int lastStart)
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
int pos = i;
CurrentState state = stateRef;
// Loop backwards through each character in the input, transitioning from state to state for each.
bool result = true;
while (true)
{
int positionId = TInputReader.GetPositionId(this, input, pos - 1);
// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
if (_nullabilityArray[state.DfaStateId] != 0 &&
TNullabilityHandler.IsNullableAt<DfaStateHandler>(this, in state, positionId))
{
lastStart = pos;
}
// If we are past the start threshold or if the state is a dead end, bail; we should have already
// found a valid starting location.
if (pos <= startThreshold || state.DfaStateId == _deadStateId)
{
Debug.Assert(lastStart != -1);
break;
}
// Try to transition with the next character, the one before the current position.
if (!DfaStateHandler.TryTakeTransition(this, ref state.DfaStateId, positionId))
{
// Return false to indicate the search didn't finish.
result = false;
break;
}
// Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
pos--;
}
// Write back the local copies of the ref values.
stateRef = state;
i = pos;
return result;
}
private bool FindStartPositionDeltasNFA<TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
// To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
int pos = i;
// Loop backwards through each character in the input, transitioning from state to state for each.
bool result = true;
while (true)
{
int positionId = TInputReader.GetPositionId(this, input, pos - 1);
// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
if (TNullabilityHandler.IsNullableAt<NfaStateHandler>(this, in state, positionId))
{
lastStart = pos;
}
// If we are past the start threshold or if the state is a dead end, bail; we should have already
// found a valid starting location.
if (pos <= startThreshold || state.DfaStateId == _deadStateId)
{
Debug.Assert(lastStart != -1);
break;
}
// Try to transition with the next character, the one before the current position.
if (!NfaStateHandler.TryTakeTransition(this, ref state, positionId))
{
// Return false to indicate the search didn't finish.
result = false;
break;
}
// Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
pos--;
}
// Write back the local copies of the ref values.
i = pos;
return result;
}
/// <summary>Run the pattern on a match to record the capture starts and ends.</summary>
/// <param name="input">input span</param>
/// <param name="i">inclusive start position</param>
/// <param name="iEnd">exclusive end position</param>
/// <param name="perThreadData">Per thread data reused between calls.</param>
/// <returns>the final register values, which indicate capture starts and ends</returns>
private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i, int iEnd, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
{
// Pick the correct start state based on previous character kind.
MatchingState<TSet> initialState = _initialStates[GetCharKind(input, i - 1)];
Registers initialRegisters = perThreadData.InitialRegisters;
// Initialize registers with -1, which means "not seen yet"
Array.Fill(initialRegisters.CaptureStarts, -1);
Array.Fill(initialRegisters.CaptureEnds, -1);
// Use two maps from state IDs to register values for the current and next set of states.
// Note that these maps use insertion order, which is used to maintain priorities between states in a way
// that matches the order the backtracking engines visit paths.
Debug.Assert(perThreadData.Current is not null && perThreadData.Next is not null);
SparseIntMap<Registers> current = perThreadData.Current, next = perThreadData.Next;
current.Clear();
next.Clear();
ForEachNfaState(initialState.Node, initialState.PrevCharKind, (current, initialRegisters),
static (int nfaId, (SparseIntMap<Registers> Current, Registers InitialRegisters) args) =>
args.Current.Add(nfaId, args.InitialRegisters.Clone()));
while ((uint)i < (uint)iEnd)
{
Debug.Assert(next.Count == 0);
// i is guaranteed to be within bounds, so the position ID is a minterm ID
int mintermId = TInputReader.GetPositionId(this, input, i);
foreach ((int sourceId, Registers sourceRegisters) in current.Values)
{
// Get or create the transitions
int offset = DeltaOffset(sourceId, mintermId);
(int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ??
CreateNewCapturingTransition(sourceId, mintermId, offset);
// Take the transitions in their prioritized order
for (int j = 0; j < transitions.Length; ++j)
{
(int targetStateId, DerivativeEffect[] effects) = transitions[j];
// Try to add the state and handle the case where it didn't exist before. If the state already
// exists, then the transition can be safely ignored, as the existing state was generated by a
// higher priority transition.
if (next.Add(targetStateId, out int index))
{
// Avoid copying the registers on the last transition from this state, reusing the registers instead
Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters;
newRegisters.ApplyEffects(effects, i);
next.Update(index, targetStateId, newRegisters);
int coreStateId = GetCoreStateId(targetStateId);
StateFlags flags = _stateFlagsArray[coreStateId];
Debug.Assert(coreStateId != _deadStateId);
if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1))))
{
// No lower priority transitions from this or other source states are taken because the
// backtracking engines would return the match ending here.
goto BreakNullable;
}
}
}
}
BreakNullable:
// Swap the state sets and prepare for the next character
SparseIntMap<Registers> tmp = current;
current = next;
next = tmp;
next.Clear();
i++;
}
Debug.Assert(current.Count > 0);
foreach ((int endStateId, Registers endRegisters) in current.Values)
{
MatchingState<TSet> endState = GetState(GetCoreStateId(endStateId));
if (endState.IsNullableFor(GetCharKind(input, iEnd)))
{
// Apply effects for finishing at the stored end state
endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos),
CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd));
return endRegisters;
}
}
Debug.Fail("No nullable state found in the set of end states");
return default;
}
/// <summary>Look up the min term ID for the character.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int GetMintermId(byte[] mintermLookup, char c) =>
c < (uint)mintermLookup.Length ?
mintermLookup[c] :
0;
/// <summary>Stores additional data for tracking capture start and end positions.</summary>
/// <remarks>The NFA simulation based third phase has one of these for each current state in the current set of live states.</remarks>
internal struct Registers(int[] captureStarts, int[] captureEnds)
{
public int[] CaptureStarts { get; set; } = captureStarts;
public int[] CaptureEnds { get; set; } = captureEnds;
/// <summary>
/// Applies a list of effects in order to these registers at the provided input position. The order of effects
/// should not matter though, as multiple effects to the same capture start or end do not arise.
/// </summary>
/// <param name="effects">list of effects to be applied</param>
/// <param name="pos">the current input position to record</param>
public void ApplyEffects(DerivativeEffect[] effects, int pos)
{
foreach (DerivativeEffect effect in effects)
{
ApplyEffect(effect, pos);
}
}
/// <summary>
/// Apply a single effect to these registers at the provided input position.
/// </summary>
/// <param name="effect">the effecto to be applied</param>
/// <param name="pos">the current input position to record</param>
public void ApplyEffect(DerivativeEffect effect, int pos)
{
switch (effect.Kind)
{
case DerivativeEffectKind.CaptureStart:
CaptureStarts[effect.CaptureNumber] = pos;
break;
case DerivativeEffectKind.CaptureEnd:
CaptureEnds[effect.CaptureNumber] = pos;
break;
}
}
/// <summary>
/// Make a copy of this set of registers.
/// </summary>
/// <returns>Registers pointing to copies of this set of registers</returns>
public Registers Clone() => new Registers((int[])CaptureStarts.Clone(), (int[])CaptureEnds.Clone());
/// <summary>
/// Copy register values from another set of registers, possibly allocating new arrays if they were not yet allocated.
/// </summary>
/// <param name="other">the registers to copy from</param>
public void Assign(Registers other)
{
if (CaptureStarts is not null && CaptureEnds is not null)
{
Debug.Assert(CaptureStarts.Length == other.CaptureStarts.Length);
Debug.Assert(CaptureEnds.Length == other.CaptureEnds.Length);
Array.Copy(other.CaptureStarts, CaptureStarts, CaptureStarts.Length);
Array.Copy(other.CaptureEnds, CaptureEnds, CaptureEnds.Length);
}
else
{
CaptureStarts = (int[])other.CaptureStarts.Clone();
CaptureEnds = (int[])other.CaptureEnds.Clone();
}
}
}
/// <summary>
/// Per thread data to be held by the regex runner and passed into every call to FindMatch. This is used to
/// avoid repeated memory allocation.
/// </summary>
internal sealed class PerThreadData
{
public readonly NfaMatchingState NfaState;
/// <summary>Maps used for the capturing third phase.</summary>
public readonly SparseIntMap<Registers>? Current, Next;
/// <summary>Registers used for the capturing third phase.</summary>
public readonly Registers InitialRegisters;
public PerThreadData(int capsize)
{
NfaState = new NfaMatchingState();
// Only create data used for capturing mode if there are subcaptures
if (capsize > 1)
{
Current = new SparseIntMap<Registers>();
Next = new SparseIntMap<Registers>();
InitialRegisters = new Registers(new int[capsize], new int[capsize]);
}
}
}
/// <summary>Stores the state that represents a current state in NFA mode.</summary>
/// <remarks>The entire state is composed of a list of individual states.</remarks>
/// <remarks>New instances should only be created once per runner.</remarks>
internal sealed class NfaMatchingState
{
/// <summary>Ordered set used to store the current NFA states.</summary>
/// <remarks>The value is unused. The type is used purely for its keys.</remarks>
public SparseIntMap<int> NfaStateSet = new();
/// <summary>Scratch set to swap with <see cref="NfaStateSet"/> on each transition.</summary>
/// <remarks>
/// On each transition, <see cref="NfaStateSetScratch"/> is cleared and filled with the next
/// states computed from the current states in <see cref="NfaStateSet"/>, and then the sets
/// are swapped so the scratch becomes the current and the current becomes the scratch.
/// </remarks>
public SparseIntMap<int> NfaStateSetScratch = new();
/// <summary>Resets this NFA state to represent the supplied DFA state.</summary>
/// <param name="matcher"></param>
/// <param name="dfaMatchingState">The DFA state to use to initialize the NFA state.</param>
public void InitializeFrom(SymbolicRegexMatcher<TSet> matcher, MatchingState<TSet> dfaMatchingState)
{
NfaStateSet.Clear();
// If the DFA state is a union of multiple DFA states, loop through all of them
// adding an NFA state for each.
matcher.ForEachNfaState(dfaMatchingState.Node, dfaMatchingState.PrevCharKind, NfaStateSet,
static (int nfaId, SparseIntMap<int> nfaStateSet) => nfaStateSet.Add(nfaId, out _));
}
}
/// <summary>Represents a current state in a DFA or NFA graph walk while processing a regular expression.</summary>
/// <remarks>This is a discriminated union between a DFA state and an NFA state. One and only one will be non-null.</remarks>
private struct CurrentState
{
/// <summary>Initializes the state as a DFA state.</summary>
public CurrentState(MatchingState<TSet> dfaState)
{
DfaStateId = dfaState.Id;
NfaState = null;
}
/// <summary>Initializes the state as an NFA state.</summary>
public CurrentState(NfaMatchingState nfaState)
{
DfaStateId = -1;
NfaState = nfaState;
}
/// <summary>The DFA state.</summary>
public int DfaStateId;
/// <summary>The NFA state.</summary>
public NfaMatchingState? NfaState;
}
/// <summary>Represents a set of routines for operating over a <see cref="CurrentState"/>.</summary>
private interface IStateHandler
{
public static abstract bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind);
public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state);
}
/// <summary>An <see cref="IStateHandler"/> for operating over <see cref="CurrentState"/> instances configured as DFA states.</summary>
private readonly struct DfaStateHandler : IStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind) =>
matcher._nullabilityArray[state.DfaStateId] > 0 &&
((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0;
/// <summary>Take the transition to the next DFA state.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref int dfaStateId, int mintermId, long timeoutOccursAt = 0)
{
Debug.Assert(dfaStateId > 0, $"Expected non-zero {nameof(dfaStateId)}.");
// Use the mintermId for the character being read to look up which state to transition to.
// If that state has already been materialized, move to it, and we're done. If that state
// hasn't been materialized, try to create it; if we can, move to it, and we're done.
int dfaOffset = matcher.DeltaOffset(dfaStateId, mintermId);
int nextStateId = matcher._dfaDelta[dfaOffset];
if (nextStateId > 0)
{
// There was an existing DFA transition to some state. Move to it and
// return that we're still operating as a DFA and can keep going.
dfaStateId = nextStateId;
return true;
}
if (matcher.TryCreateNewTransition(matcher.GetState(dfaStateId), mintermId, dfaOffset, checkThreshold: true, timeoutOccursAt, out MatchingState<TSet>? nextState))
{
// We were able to create a new DFA transition to some state. Move to it and
// return that we're still operating as a DFA and can keep going.
dfaStateId = nextState.Id;
return true;
}
return false;
}
/// <summary>
/// Gets context independent state information:
/// - whether this is an initial state
/// - whether this is a dead-end state, meaning there are no transitions possible out of the state
/// - whether this state is unconditionally nullable
/// - whether this state may be contextually nullable
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state) =>
matcher._stateFlagsArray[state.DfaStateId];
}
/// <summary>An <see cref="IStateHandler"/> for operating over <see cref="CurrentState"/> instances configured as NFA states.</summary>
private readonly struct NfaStateHandler : IStateHandler
{
/// <summary>Check if any underlying core state starts with a line anchor.</summary>
public static bool StartsWithLineAnchor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).StartsWithLineAnchor)
{
return true;
}
}
return false;
}
/// <summary>Check if any underlying core state is nullable in the context of the next character kind.</summary>
public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind)
{
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
if (matcher.GetState(matcher.GetCoreStateId(nfaState.Key)).IsNullableFor(nextCharKind))
{
return true;
}
}
return false;
}
/// <summary>Take the transition to the next NFA state.</summary>
public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId)
{
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
NfaMatchingState nfaState = state.NfaState!;
// Grab the sets, swapping the current active states set with the scratch set.
SparseIntMap<int> nextStates = nfaState.NfaStateSetScratch;
SparseIntMap<int> sourceStates = nfaState.NfaStateSet;
nfaState.NfaStateSet = nextStates;
nfaState.NfaStateSetScratch = sourceStates;
// Compute the set of all unique next states from the current source states and the mintermId.
nextStates.Clear();
if (sourceStates.Count == 1)
{
// We have a single source state. We know its next states are already deduped,
// so we can just add them directly to the destination states list.
foreach (int nextState in GetNextStates(sourceStates.Values[0].Key, mintermId, matcher))
{
nextStates.Add(nextState, out _);
// Nothing is required for backtracking simulation here, since there's just one state so the
// transition itself already handles it.
}
}
else
{
// We have multiple source states, so we need to potentially dedup across each of
// their next states. For each source state, get its next states, adding each into
// our set (which exists purely for deduping purposes), and if we successfully added
// to the set, then add the known-unique state to the destination list.
uint nextCharKind = matcher.GetPositionKind(mintermId);
foreach (ref KeyValuePair<int, int> sourceState in CollectionsMarshal.AsSpan(sourceStates.Values))
{
foreach (int nextState in GetNextStates(sourceState.Key, mintermId, matcher))
{
nextStates.Add(nextState, out _);
}
// To simulate backtracking, if a source state is nullable then no further transitions are taken
// as the backtracking engines would prefer the match ending here.
int coreStateId = matcher.GetCoreStateId(sourceState.Key);
StateFlags flags = matcher._stateFlagsArray[coreStateId];
if (flags.SimulatesBacktracking() &&
(flags.IsNullable() || (flags.CanBeNullable() && matcher.GetState(coreStateId).IsNullableFor(nextCharKind))))
{
break;
}
}
}
return true;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<TSet> matcher)
{
// Calculate the offset into the NFA transition table.
int nfaOffset = matcher.DeltaOffset(sourceState, mintermId);
// Get the next NFA state.
return matcher._nfaDelta[nfaOffset] ?? matcher.CreateNewNfaTransition(sourceState, mintermId, nfaOffset);
}
}
/// <summary>
/// Gets context independent state information:
/// - whether this is an initial state
/// - whether this is a dead-end state, meaning there are no transitions possible out of the state
/// - whether this state is unconditionally nullable
/// - whether this state may be contextually nullable
/// </summary>
/// <remarks>
/// In NFA mode:
/// - an empty set of states means that it is a dead end
/// - no set of states qualifies as an initial state. This could be made more accurate, but with that the
/// matching logic would need to be updated to handle the fact that <see cref="FindOptimizationsInitialStateHandler"/>
/// can transition back to a DFA state.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
{
// Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
// masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
// they are true for any state in the set; SimulatesBacktracking is true for all the states if
// it is true for any state (since it is a phase-wide property); and all other flags are masked out.
StateFlags flags = 0;
foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
}
return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
}
#if DEBUG
/// <summary>Undo a previous call to <see cref="TryTakeTransition"/>.</summary>
public static void UndoTransition(ref CurrentState state)
{
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
NfaMatchingState nfaState = state.NfaState!;
// Swap the current active states set with the scratch set to undo a previous transition.
SparseIntMap<int> nextStates = nfaState.NfaStateSet;
SparseIntMap<int> sourceStates = nfaState.NfaStateSetScratch;
nfaState.NfaStateSet = sourceStates;
nfaState.NfaStateSetScratch = nextStates;
// Sanity check: if there are any next states, then there must have been some source states.
Debug.Assert(nextStates.Count == 0 || sourceStates.Count > 0);
}
#endif
}
/// <summary>
/// Interface for mapping positions in the input to position IDs, which capture all the information necessary to
/// both take transitions and decide nullability.
/// </summary>
private interface IInputReader
{
/// <summary>Gets the position ID for the specified character in the input.</summary>
/// <remarks>
/// For positions of valid characters that are handled normally, these IDs coincide with minterm IDs (i.e. indices to <see cref="_minterms"/>).
/// Positions outside the bounds of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be
/// mapped to _minterms.Length for supporting the \Z anchor. The <paramref name="input"/> and <paramref name="pos"/> parameters are specified
/// separately, rather than <code>input[pos]</code> being passed in as a single <see cref="char"/>, because some inputs need to act differently
/// based on the position itself.
/// </remarks>
public static abstract int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos);
}
/// <summary>Provides an input reader that includes full handling of an \n as the last character of input for the \Z anchor.</summary>
private readonly struct DefaultInputReader : IInputReader
{
/// <summary>
/// Gets the minterm ID of the specified character, -1 if the position isn't within the input, or <see cref="_minterms"/>.Length
/// for a \n at the very end of the input.
/// </summary>
public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos)
{
if ((uint)pos < (uint)input.Length)
{
// Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
int c = input[pos];
return c == '\n' && pos == input.Length - 1 ?
matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
matcher._mintermClassifier.GetMintermID(c);
}
return -1;
}
}
/// <summary>Provides an optimized input reader that doesn't provide special-handling of \n at the end of the input for the \Z anchor.</summary>
private readonly struct NoZAnchorOptimizedInputReader : IInputReader
{
/// <summary>Gets the minterm ID of the specified character, or -1 if the position isn't within the input.</summary>
public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos) =>
(uint)pos < (uint)input.Length ?
matcher._mintermClassifier.GetMintermID(input[pos]) :
-1;
}
/// <summary>Represents a handler used to determine the next possible matching position from an initial state.</summary>
private interface IInitialStateHandler
{
/// <summary>Gets whether the handler performs any meaningful operation. If false, <see cref="TryFindNextStartingPosition"/> always returns true.</summary>
/// <remarks>
/// This should be implemented to always return a constant true or false. The consumer will inline it and, if this is false, can dead-code eliminate
/// anything guarded by the condition.
/// </remarks>
public static abstract bool IsOptimized { get; }
/// <summary>Gets the next viable starting position.</summary>
/// <returns>true if a possible match location is found; false if no match is possible anywhere in the remaining input.</returns>
/// <remarks>This may be used if <see cref="IsOptimized"/> is false but it will then always return true indicating that the current position may be viable.</remarks>
public static abstract bool TryFindNextStartingPosition(
SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, byte[]? lookup);
}
/// <summary>Provides an initial state handler for when there are no initial state optimizations to apply.</summary>
private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler
{
/// <summary>Returns false.</summary>
public static bool IsOptimized => false;
/// <summary>Returns true. No optimizations are known to be able to skip states, thus every position is a viable starting position.</summary>
public static bool TryFindNextStartingPosition(
SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, byte[]? lookup) =>
true;
}
/// <summary>Provides a handler that uses the matcher's <see cref="RegexFindOptimizations"/> to optimize searching for the next viable starting state.</summary>
private readonly struct FindOptimizationsInitialStateHandler : IInitialStateHandler
{
/// <summary>Returns true.</summary>
public static bool IsOptimized => true;
/// <summary>Gets the next viable starting position.</summary>
/// <returns>true if a viable starting position was found; false if no further possible match exists.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition(
SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, byte[]? lookup)
{
Debug.Assert(matcher._findOpts is not null);
// Find the first position that matches with some likely character.
if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
// Update the starting state based on where TryFindNextStartingPosition moved us to.
// As with the initial starting state, if it's a dead end, no match exists.
currentStateId = matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)].Id;
return true;
}
// No match exists
Debug.Assert(pos == input.Length);
currentStateId = matcher._deadStateId;
return false;
}
}
/// <summary>Provides a handler that uses the matcher's <see cref="RegexFindOptimizations"/> to optimize searching for the next viable starting state.</summary>
/// <remarks>This implementation works only when there are no /Z anchors in the pattern.</remarks>
private readonly struct NoZAnchorFindOptimizationsInitialStateHandler : IInitialStateHandler
{
/// <summary>Returns true.</summary>
public static bool IsOptimized => true;
/// <summary>Gets the next viable starting position.</summary>
/// <returns>true if a viable starting position was found; false if no further possible match exists.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition(
SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, byte[]? lookup)
{
Debug.Assert(matcher._findOpts is not null);
Debug.Assert(lookup is not null, $"{nameof(NoZAnchorFindOptimizationsInitialStateHandler)} must only be used with call sites that pass non-null {nameof(lookup)}.");
if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
// Update the starting state based on where TryFindNextStartingPosition moved us to.
// This is an optimized version of the update in FindOptimizationsInitialStateHandler that doesn't need to consider the possibility of /Z anchors.
currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input[pos - 1]) + 1]].Id;
return true;
}
// No match exists
Debug.Assert(pos == input.Length);
currentStateId = matcher._deadStateId;
return false;
}
}
/// <summary>Provides a handler that uses the matcher's <see cref="RegexFindOptimizations"/> to optimize searching for the next viable starting state.</summary>
/// <remarks>This implementation works only when there are no anchors in the pattern.</remarks>
private readonly struct NoAnchorsFindOptimizationsInitialStateHandler : IInitialStateHandler
{
/// <summary>Returns true.</summary>
public static bool IsOptimized => true;
/// <summary>Gets the next viable starting position.</summary>
/// <returns>true if a viable starting position was found; false if no further possible match exists.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition(
SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, byte[]? lookup)
{
Debug.Assert(!matcher._containsAnyAnchor);
Debug.Assert(matcher._findOpts is not null);
Debug.Assert(currentStateId == matcher._initialStateId, "There are no anchors, so the current state should be the sole initial state.");
if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
// There are no anchors, so there's only one starting state, so we don't need to update currentStateId that's already the starting state.
return true;
}
// No match exists
Debug.Assert(pos == input.Length);
currentStateId = matcher._deadStateId;
return false;
}
}
/// <summary>Represents a handler for evaluating nullability of states.</summary>
private interface INullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
public static abstract bool IsNullableAt<TStateHandler>(
SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId)
where TStateHandler : struct, IStateHandler;
}
/// <summary>Nullability handler that will work for any pattern.</summary>
private readonly struct DefaultNullabilityHandler : INullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId)
where TStateHandler : struct, IStateHandler
{
StateFlags flags = TStateHandler.GetStateFlags(matcher, in state);
return
flags.IsNullable() ||
(flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
}
}
/// <summary>Nullability handler for patterns without any anchors.</summary>
private readonly struct NoAnchorsNullabilityHandler : INullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId)
where TStateHandler : struct, IStateHandler
{
Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor);
return TStateHandler.GetStateFlags(matcher, in state).IsNullable();
}
}
/// <summary>Represents a handler for evaluating nullability of states and for use in DFAs for patterns that do not contain \Z anchors.</summary>
private interface IDfaNoZAnchorOptimizedNullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
public static abstract bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte stateNullability, char c, byte[] lookup);
}
/// <summary>Optimized nullability handler that works regardless of what additional anchors may exist in a pattern.</summary>
private readonly struct DefaultDfaNoZAnchorOptimizedNullabilityHandler : IDfaNoZAnchorOptimizedNullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte stateNullability, char c, byte[] lookup) =>
stateNullability != 0 &&
matcher.IsNullableWithContext(stateNullability, c < (uint)lookup.Length ? lookup[c] : 0);
}
/// <summary>Optimized nullability handler for when a pattern has no anchors at all.</summary>
private readonly struct NoAnchorDfaOptimizedNullabilityHandler : IDfaNoZAnchorOptimizedNullabilityHandler
{
/// <summary>Gets whether the specified position is nullable.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte stateNullability, char c, byte[] lookup)
{
Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor);
return stateNullability != 0;
}
}
}
}
|