SymbolicRegexMatcher.Automata.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Threading;
 
namespace System.Text.RegularExpressions.Symbolic
{
    internal sealed partial class SymbolicRegexMatcher<TSet>
    {
        /// <summary>
        /// Initial capacity for DFA related arrays.
        /// </summary>
        private const int InitialDfaStateCapacity = 1024;
 
        /// <summary>
        /// Minimum capacity for NFA related arrays when the matcher first enters NFA mode. The arrays start out empty,
        /// but are resized to this capacity upon first use.
        /// </summary>
        private const int InitialNfaStateCapacity = 64;
 
        /// <summary>
        /// Cache for the states that have been created. Each state is uniquely identified by its associated
        /// <see cref="SymbolicRegexNode{TSet}"/> and the kind of the previous character.
        /// </summary>
        private readonly Dictionary<(SymbolicRegexNode<TSet> Node, uint PrevCharKind), MatchingState<TSet>> _stateCache = [];
 
        /// <summary>
        /// Maps state ids to states, initial capacity is given by <see cref="InitialDfaStateCapacity"/>.
        /// Each time more states are needed the length is doubled.
        /// The first valid state is at index 1.
        /// </summary>
        private MatchingState<TSet>?[] _stateArray;
 
        /// <summary>
        /// Maps state IDs to context-independent information for all states in <see cref="_stateArray"/>.
        /// The first valid entry is at index 1.
        /// </summary>
        private StateFlags[] _stateFlagsArray;
 
        /// <summary>Cached nullability info for each state ID.</summary>
        /// <remarks>
        /// _nullabilityArray[stateId] == the <see cref="MatchingState{TSet}.NullabilityInfo"/> for that state.
        /// Used to short-circuit nullability in the hot loop.
        /// Important: the pattern must not contain endZ for this to be valid.
        /// </remarks>
        private byte[] _nullabilityArray;
 
        /// <summary>
        /// The transition function for DFA mode.
        /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
        /// the number of bits required to represent the largest minterm ID <see cref="_mintermsLog"/>, is reserved
        /// for each state. This makes indexing into this array not require a multiplication
        /// <see cref="DeltaOffset(int, int)"/>, but does mean some unused space may be present.
        /// The first valid state ID is 1.
        /// </summary>
        /// <remarks>
        /// For these "delta" arrays, technically Volatile.Read should be used to read out an element,
        /// but in practice that's not needed on the runtimes in use (though that needs to be documented
        /// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is
        /// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789).
        /// </remarks>
        private int[] _dfaDelta;
 
        /// <summary>
        /// Maps each NFA state id to the state id of the MatchingState stored in _stateArray.
        /// This map is used to compactly represent NFA state ids in NFA mode in order to utilize
        /// the property that all NFA states are small integers in one interval.
        /// The valid entries are 0 to the size of <see cref="_nfaIdByCoreId"/> - 1.
        /// </summary>
        private int[] _nfaCoreIdArray = [];
 
        /// <summary>
        /// Maps the id of a MatchingState to the NFA state id that it is being identified with in the NFA.
        /// It is the inverse of used entries in _nfaStateArray.
        /// The range of this map is 0 to its size - 1.
        /// </summary>
        private readonly Dictionary<int, int> _nfaIdByCoreId = [];
 
        /// <summary>
        /// Transition function for NFA transitions in NFA mode.
        /// Each NFA entry maps to a list of NFA target states.
        /// Each list of target states is without repetitions.
        /// If the entry is null then the targets states have not been computed yet.
        /// </summary>
        private int[]?[] _nfaDelta = [];
 
        /// <summary>
        /// The transition function for <see cref="FindSubcaptures(ReadOnlySpan{char}, int, int, PerThreadData)"/>,
        /// which is an NFA mode with additional state to track capture start and end positions.
        /// Each entry is an array of pairs of target state and effects to be applied when taking the transition.
        /// If the entry is null then the transition has not been computed yet.
        /// </summary>
        private (int, DerivativeEffect[])[]?[] _capturingNfaDelta = [];
 
        /// <summary>
        /// Implements a version of <see cref="Array.Resize"/> that is guaranteed to not publish an array before values
        /// have been copied over.
        /// </summary>
        /// <remarks>
        /// This may not be strictly necessary for arrays of primitive or reference types (which have atomic
        /// reads/writes), as when, e.g., <see cref="_dfaDelta"/> is found to not have an entry the array is checked again
        /// after a lock on the matcher has been acquired. However, in a highly threaded use case it still seems better
        /// to avoid unnecessarily causing other threads to acquire the lock.
        /// </remarks>
        private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
        {
            Debug.Assert(newSize >= array.Length);
            T[] newArray = new T[newSize];
            Array.Copy(array, newArray, array.Length);
            Volatile.Write(ref array, newArray);
        }
 
        private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId;
 
        /// <summary>
        /// Pre-computed hot-loop version of nullability check
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private bool IsNullableWithContext(byte stateNullability, int mintermId) =>
            (stateNullability & (1 << (int)GetPositionKind(mintermId))) != 0;
 
        /// <summary>Returns the span from <see cref="_dfaDelta"/> that may contain transitions for the given state</summary>
        private Span<int> GetDeltasFor(MatchingState<TSet> state)
        {
            Debug.Assert(Monitor.IsEntered(this));
 
            int numMinterms = _minterms.Length;
            if (state.StartsWithLineAnchor)
            {
                numMinterms++;
            }
 
            return _dfaDelta.AsSpan(state.Id << _mintermsLog, numMinterms);
        }
 
        /// <summary>Returns the span from <see cref="_nfaDelta"/> that may contain transitions for the given state</summary>
        private Span<int[]?> GetNfaDeltasFor(MatchingState<TSet> state)
        {
            Debug.Assert(Monitor.IsEntered(this));
 
            if (!_nfaIdByCoreId.TryGetValue(state.Id, out int nfaState))
            {
                return default;
            }
 
            int numMinterms = _minterms.Length;
            if (state.StartsWithLineAnchor)
            {
                numMinterms++;
            }
 
            return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms);
        }
 
        /// <summary>
        /// Create a state with given node and previous character context.
        /// </summary>
        /// <param name="node">the pattern that this state will represent</param>
        /// <param name="prevCharKind">the kind of the character that led to this state</param>
        /// <returns></returns>
        private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint prevCharKind)
        {
            Debug.Assert(Monitor.IsEntered(this));
            return GetOrCreateState_NoLock(node, prevCharKind);
        }
 
        /// <summary>
        /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process
        /// for when finding the beginning of a match.
        /// </summary>
        /// <remarks>
        /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. for the pattern abc.*def
        /// 1) the end is found at abc.*def|
        /// 2) the reversal starts at abc.*|
        /// </remarks>
        /// <param name="node">Reversed initial pattern</param>
        /// <returns>The match reversal details.</returns>
        private MatchReversalInfo<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
        {
            int pos = 0;
            while (true)
            {
                if (node._info.ContainsSomeAnchor)
                {
                    // Bail if it contains any anchors as it invalidates the optimization.
                    // (This could potentially be a very good future optimization for anchors but there's too many edge cases to guarantee it works.
                    // One example which fails currently: pattern: @"\By\b", input: "xy")
                    pos = 0;
                    break;
                }
 
                if (node._kind is not SymbolicRegexNodeKind.Concat)
                {
                    if (node._kind is SymbolicRegexNodeKind.CaptureStart)
                    {
                        node = _builder.Epsilon; // The entire match is fixed length.
                    }
                    break;
                }
 
                SymbolicRegexNode<TSet>? left = node._left;
                Debug.Assert(left is not null);
 
                if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton)
                {
                    node = node._right!;
                    if (left._kind is SymbolicRegexNodeKind.Singleton)
                    {
                        pos++;
                    }
                }
                else if (left._kind is SymbolicRegexNodeKind.Loop)
                {
                    if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton)
                    {
                        break;
                    }
 
                    node = left._lower == left._upper ?
                        node._right! : // The entire loop is fixed
                        _builder.CreateConcat( // Subtract the fixed part of the loop.
                            _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower),
                            node._right!);
                    pos += left._lower;
                }
                else
                {
                    break;
                }
            }
 
            Debug.Assert(pos >= 0);
            return
                pos == 0 ? new MatchReversalInfo<TSet>(MatchReversalKind.MatchStart, 0) :
                node == _builder.Epsilon ? new MatchReversalInfo<TSet>(MatchReversalKind.FixedLength, pos) :
                new MatchReversalInfo<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0));
        }
 
        /// <summary>
        /// Create a state with given node and previous character context.
        /// </summary>
        /// <param name="node">the pattern that this state will represent</param>
        /// <param name="prevCharKind">the kind of the character that led to this state</param>
        /// <param name="isInitialState">whether to mark the state as an initial state or not</param>
        /// <returns></returns>
        private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node, uint prevCharKind, bool isInitialState = false)
        {
            SymbolicRegexNode<TSet> prunedNode = node.PruneAnchors(_builder, prevCharKind);
            (SymbolicRegexNode<TSet> Node, uint PrevCharKind) key = (prunedNode, prevCharKind);
            if (!_stateCache.TryGetValue(key, out MatchingState<TSet>? state))
            {
                state = new MatchingState<TSet>(key.Node, key.PrevCharKind);
                _stateCache.Add(key, state); // Add to cache first to make 1 the first state ID
                state.Id = _stateCache.Count;
 
                Debug.Assert(_stateArray is not null);
 
                if (state.Id == _stateArray.Length)
                {
                    // The growth factor 2 matches that of List<T>
                    int newsize = _stateArray.Length * 2;
                    ArrayResizeAndVolatilePublish(ref _stateArray, newsize);
                    ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog);
                    ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize);
                    ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize);
                }
                _stateArray[state.Id] = state;
                _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
                _nullabilityArray[state.Id] = (byte)state.NullabilityInfo;
            }
 
            return state;
        }
 
        /// <summary>
        /// Make an NFA state for the given node and previous character kind. NFA states include a "core state" of a
        /// <see cref="MatchingState{TSet}"/> allocated with <see cref="GetOrCreateState(SymbolicRegexNode{TSet}, uint)"/>,
        /// which stores the pattern and previous character kind and can be used for creating further NFA transitions.
        /// In addition to the ID of the core state, NFA states are allocated a new NFA mode specific ID, which is
        /// used to index into NFA mode transition arrays (e.g. <see cref="_nfaDelta"/>).
        /// </summary>
        /// <remarks>
        /// Using an ID numbering for NFA mode that is separate from DFA mode allows the IDs to be smaller, which saves
        /// space both in the NFA mode arrays and in the <see cref="SparseIntMap{T}"/> instances used during matching for
        /// sets of NFA states.
        /// The core state ID can be looked up by the NFA ID with <see cref="GetCoreStateId(int)"/>.
        /// </remarks>
        /// <returns>the NFA ID of the new state, or null if the state is a dead end</returns>
        private int? CreateNfaState(SymbolicRegexNode<TSet> node, uint prevCharKind)
        {
            Debug.Assert(Monitor.IsEntered(this));
            Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate);
 
            // First make the core state for the node, which is used for creating further transitions out of this state
            MatchingState<TSet> coreState = GetOrCreateState(node, prevCharKind);
 
            // If the state is a dead end then don't create an NFA state, as dead ends in NFA mode are represented
            // as empty lists of states.
            if (coreState.IsDeadend(Solver))
            {
                return null;
            }
 
            // The NFA state itself is an ID that can be mapped back to the ID of the MatchingState. These NFA states are
            // allocated separately from the IDs used in DFA mode to avoid large values, which helps save memory in the
            // SparseIntMap data structures used in NFA matching modes.
            if (!_nfaIdByCoreId.TryGetValue(coreState.Id, out int nfaStateId))
            {
                // No NFA state already exists, so make a new one. NFA state IDs are allocated sequentially from zero by
                // giving each new state an ID equal to the number of existing NFA states.
                nfaStateId = _nfaIdByCoreId.Count;
 
                // If the next ID is past the end of the NFA state array, increase the sizes of the NFA arrays
                if (nfaStateId == _nfaCoreIdArray.Length)
                {
                    // The growth factor 2 matches that of List<T>
                    int newsize = Math.Max(_nfaCoreIdArray.Length * 2, InitialNfaStateCapacity);
                    ArrayResizeAndVolatilePublish(ref _nfaCoreIdArray, newsize);
                    ArrayResizeAndVolatilePublish(ref _nfaDelta, newsize << _mintermsLog);
                    ArrayResizeAndVolatilePublish(ref _capturingNfaDelta, newsize << _mintermsLog);
                }
 
                // Store the mapping from NFA state ID to core state ID
                Debug.Assert(nfaStateId < _nfaCoreIdArray.Length);
                _nfaCoreIdArray[nfaStateId] = coreState.Id;
 
                // Store the mapping from core state ID to NFA state ID
                // Adding an entry here increments the ID that will be given to the next NFA state
                _nfaIdByCoreId.Add(coreState.Id, nfaStateId);
            }
 
            return nfaStateId;
        }
 
        /// <summary>Gets the <see cref="MatchingState{TSet}"/> corresponding to the given state ID.</summary>
        private MatchingState<TSet> GetState(int stateId)
        {
            Debug.Assert(stateId > 0);
            MatchingState<TSet>? state = _stateArray[stateId];
            Debug.Assert(state is not null);
            return state;
        }
 
        /// <summary>Gets the core state Id corresponding to the NFA state</summary>
        private int GetCoreStateId(int nfaStateId)
        {
            Debug.Assert(nfaStateId < _nfaCoreIdArray.Length);
            Debug.Assert(_nfaCoreIdArray[nfaStateId] < _stateArray.Length);
            return _nfaCoreIdArray[nfaStateId];
        }
 
        /// <summary>Gets or creates a new DFA transition.</summary>
        /// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
        private bool TryCreateNewTransition(MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, long timeoutOccursAt, [NotNullWhen(true)] out MatchingState<TSet>? nextState)
        {
            Debug.Assert(offset < _dfaDelta.Length);
            lock (this)
            {
                // check if meanwhile delta[offset] has become defined possibly by another thread
                MatchingState<TSet>? targetState = _stateArray[_dfaDelta[offset]];
                if (targetState is null)
                {
                    if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer
                        (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold
                    {
                        nextState = null;
                        return false;
                    }
 
                    TSet minterm = GetMintermFromId(mintermId);
                    uint nextCharKind = GetPositionKind(mintermId);
                    targetState = GetOrCreateState(sourceState.Next(_builder, minterm, nextCharKind), nextCharKind);
                    Volatile.Write(ref _dfaDelta[offset], targetState.Id);
                }
 
                nextState = targetState;
                return true;
            }
        }
 
        /// <summary>Gets or creates a new NFA transition.</summary>
        /// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
        private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset)
        {
            Debug.Assert(nfaOffset < _nfaDelta.Length);
 
            lock (this)
            {
                // check if meanwhile the nfaoffset has become defined possibly by another thread
                int[]? targets = _nfaDelta[nfaOffset];
                if (targets is null)
                {
                    // Create the underlying transition from the core state corresponding to the nfa state
                    int coreId = GetCoreStateId(nfaStateId);
                    int coreOffset = (coreId << _mintermsLog) | mintermId;
                    int coreTargetId = _dfaDelta[coreOffset];
                    MatchingState<TSet> coreState = GetState(coreId);
                    TSet minterm = GetMintermFromId(mintermId);
                    uint nextCharKind = GetPositionKind(mintermId);
                    SymbolicRegexNode<TSet> targetNode = coreTargetId > 0 ?
                        GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind);
 
                    List<int> targetsList = [];
                    ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List<int> targetsList) =>
                        targetsList.Add(nfaId));
 
                    targets = targetsList.ToArray();
                    Volatile.Write(ref _nfaDelta[nfaOffset], targets);
                }
 
                return targets;
            }
        }
 
        /// <summary>Gets or creates a new capturing NFA transition.</summary>
        /// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
        private (int, DerivativeEffect[])[] CreateNewCapturingTransition(int nfaStateId, int mintermId, int offset)
        {
            lock (this)
            {
                // Get the next state if it exists.  The caller should have already tried and found it null (not yet created),
                // but in the interim another thread could have created it.
                (int, DerivativeEffect[])[]? targets = _capturingNfaDelta[offset];
                if (targets is null)
                {
                    MatchingState<TSet> coreState = GetState(GetCoreStateId(nfaStateId));
                    TSet minterm = GetMintermFromId(mintermId);
                    uint nextCharKind = GetPositionKind(mintermId);
                    List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind);
 
                    // Build the new state and store it into the array.
                    List<(int, DerivativeEffect[])> targetsList = [];
                    foreach ((SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects) entry in transition)
                    {
                        ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects),
                            static (int nfaId, (List<(int, DerivativeEffect[])> Targets, DerivativeEffect[] Effects) args) =>
                                args.Targets.Add((nfaId, args.Effects)));
                    }
                    targets = targetsList.ToArray();
                    Volatile.Write(ref _capturingNfaDelta[offset], targets);
                }
 
                return targets;
            }
        }
 
        /// <summary>
        /// Iterates through the alternation branches <see cref="SymbolicRegexNode{TSet}.EnumerateAlternationBranches(SymbolicRegexBuilder{TSet})"/>
        /// and tries to create NFA states for each. The supplied action is called for each created NFA state. These never
        /// include dead ends as <see cref="CreateNfaState(SymbolicRegexNode{TSet}, uint)"/> will filter those out.
        /// </summary>
        /// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
        /// <typeparam name="T">the type of the additional argument passed through to the action</typeparam>
        /// <param name="node">the node to break up into NFA states</param>
        /// <param name="prevCharKind">the previous character kind for each created NFA state</param>
        /// <param name="arg">an additional argument passed through to each call to the action</param>
        /// <param name="action">action to call for each NFA state</param>
        private void ForEachNfaState<T>(SymbolicRegexNode<TSet> node, uint prevCharKind, T arg, Action<int, T> action)
        {
            lock (this)
            {
                foreach (SymbolicRegexNode<TSet> nfaNode in node.EnumerateAlternationBranches(_builder))
                {
                    if (CreateNfaState(nfaNode, prevCharKind) is int nfaId)
                    {
                        action(nfaId, arg);
                    }
                }
            }
        }
    }
}
File: System\Text\RegularExpressions\Symbolic\SymbolicRegexMatcher.Automata.cs	Web Access
Project: src\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)