RegexRunner.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.ComponentModel;
using System.Runtime.CompilerServices;

namespace System.Text.RegularExpressions
{
    /// <summary>
    /// Base class for source-generated regex extensibility
    /// (and the old CompileToAssembly extensibility).
    /// It's not intended to be used by anything else.
    /// </summary>
    /// <remarks>
    /// Provides the driver code that calls the subclass's Scan
    /// method for either scanning or direct execution.
    /// Also maintains memory allocation for the backtracking stack,
    /// the grouping stack and the longjump crawlstack, and provides
    /// methods to push new subpattern match results into (or remove
    /// backtracked results from) the Match instance.
    /// </remarks>
    [EditorBrowsable(EditorBrowsableState.Never)]
    public abstract class RegexRunner
    {
        /// <summary>Index of the first character to search</summary>
        /// <remarks>
        /// We now always use a sliced span of the input
        /// from runtextbeg to runtextend, which means that runtextbeg is now always 0 except
        /// for CompiledToAssembly scenario which works over the original input.
        /// </remarks>
        protected internal int runtextbeg;

        /// <summary>Index just past the last character to search</summary>
        /// <remarks>
        /// Because we now pass in a sliced span of the input into Scan,
        /// the runtextend will always match the length of that passed in span except for CompileToAssembly
        /// scenario, which still works over the original input.
        /// </remarks>
        protected internal int runtextend;

        /// <summary>Index of the starting character for the search.</summary>
        /// <remarks>
        /// The differs from <see cref="runtextbeg"/> in that lookbehinds will be able to see text before
        /// <see cref="runtextstart"/> but not before <see cref="runtextbeg"/>.
        /// </remarks>
        protected internal int runtextstart;

        /// <summary>Text to search. May be null if the input was supplied as a span.</summary>
        protected internal string? runtext;

        /// <summary>Current position in text</summary>
        protected internal int runtextpos;

        /// <summary>Backtracking stack</summary>
        /// <remarks>
        /// Opcodes use this to store data regarding
        /// what they have matched and where to backtrack to.  Each "frame" on
        /// the stack takes the form of [CodePosition Data1 Data2...], where
        /// CodePosition is the position of the current opcode and
        /// the data values are all optional.  The CodePosition can be negative, and
        /// these values (also called "back2") are used by the BranchMark family of opcodes
        /// to indicate whether they are backtracking after a successful or failed
        /// match.
        /// When we backtrack, we pop the CodePosition off the stack, set the current
        /// instruction pointer to that code position, and mark the opcode
        /// with a backtracking flag ("Back").  Each opcode then knows how to
        /// handle its own data.
        /// </remarks>
        protected internal int[]? runtrack;
        /// <summary>Backtracking stack position</summary>
        protected internal int runtrackpos;

        /// <summary>Utility stack</summary>
        /// <remarks>
        /// This stack is used to track text positions across different opcodes.
        /// For example, in /(a*b)+/, the parentheses result in a SetMark/CaptureMark
        /// pair. SetMark records the text position before we match a*b.  Then
        /// CaptureMark uses that position to figure out where the capture starts.
        /// Opcodes which push onto this stack are always paired with other opcodes
        /// which will pop the value from it later.  A successful match should mean
        /// that this stack is empty.
        /// </remarks>
        protected internal int[]? runstack;
        /// <summary>Utility stack position</summary>
        protected internal int runstackpos;

        /// <summary>Crawl stack</summary>
        /// <remarks>
        /// Every time a group has a capture, we push its group number onto the runcrawl stack.
        /// In the case of a balanced match, we push BOTH groups onto the stack.
        /// </remarks>
        protected internal int[]? runcrawl;
        /// <summary>Crawl stack position</summary>
        protected internal int runcrawlpos;

        /// <summary>Count of states that may do backtracking</summary>
        protected internal int runtrackcount;

        /// <summary>Result object</summary>
        protected internal Match? runmatch;
        /// <summary>Regex object</summary>
        protected internal Regex? runregex;

        /// <summary>Mode in which the runner is operating</summary>
        private protected RegexRunnerMode _mode;

        /// <summary>Timeout in milliseconds</summary>
        private int _timeout;
        private bool _checkTimeout;
        private long _timeoutOccursAt;

        /// <summary>Initializes a new instance of the <see cref="RegexRunner"/> class.</summary>
        protected RegexRunner() { }

        /// <summary>Used by a <see cref="Regex"/> object to scan the input <paramref name="text"/> looking for the next match.</summary>
        /// <remarks>This API supports the product infrastructure and is not intended to be used directly from your code.</remarks>
        /// <param name="text">The text to scan for a pattern match.</param>
        /// <exception cref="NotSupportedException">
        /// <see cref="ReadOnlySpan{T}"/>-based <see cref="Regex"/> methods are not supported from <see cref="Regex"/>-derived types
        /// generated by Regex.CompileToAssembly.
        /// </exception>
        protected internal virtual void Scan(ReadOnlySpan<char> text)
        {
            // This base implementation is overridden by all of the built-in engines and by all source-generated
            // implementations.  The only time this should end up being used is if someone is using a Regex-derived
            // type created by .NET Framework's Regex.CompileToAssembly, in which case it will have overridden
            // FindFirstChar and Go but not Scan (which didn't exist yet).  This isn't an officially supported configuration,
            // using assemblies built for .NET Framework and targeting .NET Framework surface area against this
            // implementation, but we make a best-effort to keep things functional.
            string? s = runtext;

            // We can assume that the passed in 'text' span is a slice of the original text input runtext. That said we need to calculate
            // what the original beginning was and can't do it by just using the lengths of text and runtext, since we can't guarantee that
            // the passed in beginning and length match the size of the original input. We instead use MemoryExtensions Overlaps to find the
            // offset in memory between them. We intentionally use s.Overlaps(text) since we want to get a positive value.
            s.AsSpan().Overlaps(text, out int beginning);

            // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario
            // we require to use the complete input and to use the full string instead. We first test to ensure that the
            // passed in span matches the original input by using the original runtextbeg. If that is not the case,
            // then it means the user is calling the new span-based APIs using CompiledToAssembly, so we throw NSE
            // so as to prevent a lot of unexpected allocations.
            if (s == null || text != s.AsSpan(beginning, text.Length))
            {
                // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called.
                throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly);
            }

            // If the original beginning wasn't zero, then we have to adjust some of the
            // internal fields of RegexRunner to ensure the Precompiled Go and FFC methods
            // will continue to work as expected since they work over the original input, as opposed
            // to using the sliced span.
            if (beginning != 0)
            {
                runtextbeg = beginning;
                runtextstart += beginning;
                runtextend += beginning;
            }

            InternalScan(runregex!, beginning, beginning + text.Length);
        }

        /// <summary>Used by a <see cref="Regex"/> object to scan the input <paramref name="text"/>.</summary>
        [Obsolete(Obsoletions.RegexExtensibilityImplMessage, DiagnosticId = Obsoletions.RegexExtensibilityDiagId, UrlFormat = Obsoletions.SharedUrlFormat)]
        protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) =>
            Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout);

        /// <summary>
        /// This method's body is only kept since it is a protected member that could be called by someone outside
        /// the assembly.
        /// </summary>
        [Obsolete(Obsoletions.RegexExtensibilityImplMessage, DiagnosticId = Obsoletions.RegexExtensibilityDiagId, UrlFormat = Obsoletions.SharedUrlFormat)]
        protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout)
        {
            InitializeTimeout(timeout);

            RegexRunnerMode mode = quick ? RegexRunnerMode.ExistenceRequired : RegexRunnerMode.FullMatchRequired;

            // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text
            runtext = text;

            InitializeForScan(regex, text, textstart, mode);

            // InitializeForScan will default runtextstart and runtextend to 0 and length of string
            // since it is configured to work over a sliced portion of text so we adjust those values.
            runtextstart = textstart;
            runtextend = textend;

            // Configure the additional value to "bump" the position along each time we loop around
            // to call FindFirstChar again, as well as the stopping position for the loop.  We generally
            // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump
            // by -1 and stop at textbeg.
            int bump = 1, stoppos = textend;
            if (regex.RightToLeft)
            {
                bump = -1;
                stoppos = textbeg;
            }

            // If previous match was empty or failed, advance by one before matching.
            if (prevlen == 0)
            {
                if (textstart == stoppos)
                {
                    return Match.Empty;
                }

                runtextpos += bump;
            }

            Match match = InternalScan(regex, textbeg, textend);
            runtext = null; //drop reference

            if (match.FoundMatch)
            {
                if (quick)
                {
                    return null;
                }

                runmatch = null;
                match.Tidy(runtextpos, 0, mode);
            }
            else
            {
                runmatch!.Text = null;
            }

            return match;

        }

        private Match InternalScan(Regex regex, int textbeg, int textend)
        {
            // Configure the additional value to "bump" the position along each time we loop around
            // to call FindFirstChar again, as well as the stopping position for the loop.  We generally
            // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump
            // by -1 and stop at textbeg.
            int bump = 1, stoppos = textend;
            if (regex.RightToLeft)
            {
                bump = -1;
                stoppos = textbeg;
            }

            while (true)
            {
                // Find the next potential location for a match in the input.
                if (FindFirstChar())
                {
                    CheckTimeout();

                    // See if there's a match at this position.
                    Go();

                    if (runmatch!.FoundMatch)
                    {
                        return runmatch;
                    }

                    // Reset state for another iteration.
                    runtrackpos = runtrack!.Length;
                    runstackpos = runstack!.Length;
                    runcrawlpos = runcrawl!.Length;
                }

                // We failed to match at this position.  If we're at the stopping point, we're done.
                if (runtextpos == stoppos)
                {
                    return Match.Empty;
                }

                // Bump by one (in whichever direction is appropriate) and loop to go again.
                runtextpos += bump;
            }
        }

        internal void InitializeForScan(Regex regex, ReadOnlySpan<char> text, int textstart, RegexRunnerMode mode)
        {
            // Store remaining arguments into fields now that we're going to start the scan.
            // These are referenced by the derived runner.
            _mode = mode;
            runregex = regex;
            runtextstart = textstart;
            runtextbeg = 0;
            runtextend = text.Length;
            runtextpos = textstart;

            Match? m = runmatch;
            if (m is null)
            {
                // Use a hashtabled Match object if the capture numbers are sparse
                runmatch = runregex!.caps is null ?
                    new Match(runregex, runregex.capsize, runtext, text.Length) :
                    new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, text.Length);
            }
            else
            {
                m.Reset(runtext, text.Length);
            }

            // Note we test runcrawl, because it is the last one to be allocated
            // If there is an alloc failure in the middle of the three allocations,
            // we may still return to reuse this instance, and we want to behave
            // as if the allocations didn't occur.
            if (runcrawl != null)
            {
                runtrackpos = runtrack!.Length;
                runstackpos = runstack!.Length;
                runcrawlpos = runcrawl.Length;
                return;
            }

            // Everything above runs once per match.
            // Everything below runs once per runner.

            InitTrackCount();

            int stacksize;
            int tracksize = stacksize = runtrackcount * 8;

            if (tracksize < 32)
            {
                tracksize = 32;
            }
            if (stacksize < 16)
            {
                stacksize = 16;
            }

            runtrack = new int[tracksize];
            runtrackpos = tracksize;

            runstack = new int[stacksize];
            runstackpos = stacksize;

            runcrawl = new int[32];
            runcrawlpos = 32;
        }

        internal void InitializeTimeout(TimeSpan timeout)
        {
            // Handle timeout argument
            _checkTimeout = false;
            if (Regex.InfiniteMatchTimeout != timeout)
            {
                ConfigureTimeout(timeout);

                void ConfigureTimeout(TimeSpan timeout)
                {
                    _checkTimeout = true;
                    _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round;
                    _timeoutOccursAt = Environment.TickCount64 + _timeout;
                }
            }
        }

        /// <summary>Checks whether the operation has timed out, and throws <see cref="RegexMatchTimeoutException"/> if it has.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        protected internal void CheckTimeout()
        {
            if (_checkTimeout && Environment.TickCount64 >= _timeoutOccursAt)
            {
                ThrowRegexTimeout();
            }

            void ThrowRegexTimeout() => throw new RegexMatchTimeoutException(runtext ?? string.Empty, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout));
        }

        /// <summary>
        /// The responsibility of Go() is to run the regular expression at
        /// runtextpos and call Capture() on all the captured subexpressions,
        /// then to leave runtextpos at the ending position. It should leave
        /// runtextpos where it started if there was no match.
        /// </summary>
        protected virtual void Go() => throw new NotImplementedException();

        /// <summary>
        /// The responsibility of FindFirstChar() is to advance runtextpos
        /// until it is at the next position which is a candidate for the
        /// beginning of a successful match.
        /// </summary>
        protected virtual bool FindFirstChar() => throw new NotImplementedException();

        /// <summary>
        /// InitTrackCount must initialize the runtrackcount field; this is
        /// used to know how large the initial runtrack and runstack arrays
        /// must be.
        /// </summary>
        protected virtual void InitTrackCount() { }

        /// <summary>
        /// Called by the implementation of Go() to increase the size of storage
        /// </summary>
        protected void EnsureStorage()
        {
            int limit = runtrackcount * 4;

            if (runstackpos < limit)
                DoubleStack();

            if (runtrackpos < limit)
                DoubleTrack();
        }

        /// <summary>
        /// Called by the implementation of Go() to decide whether the pos
        /// at the specified index is a boundary or not. It's just not worth
        /// emitting inline code for this logic.
        /// </summary>
        protected bool IsBoundary(int index, int startpos, int endpos)
        {
            return (index > startpos && RegexCharClass.IsBoundaryWordChar(runtext![index - 1])) !=
                   (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index]));
        }

        internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)
        {
            int indexM1 = index - 1;
            return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[indexM1])) !=
                   ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index]));
        }

        /// <summary>Determines whether the specified index is a boundary.</summary>",
        /// <remarks>This variant is only employed when the subsequent character will separately be validated as a word character.</remarks>",
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool IsPreWordCharBoundary(ReadOnlySpan<char> inputSpan, int index)
        {
            int indexMinus1 = index - 1;
            return (uint)indexMinus1 >= (uint)inputSpan.Length || !RegexCharClass.IsBoundaryWordChar(inputSpan[indexMinus1]);
        }

        /// <summary>Determines whether the specified index is a boundary.</summary>
        /// <remarks>This variant is only employed when the previous character has already been validated as a word character.</remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool IsPostWordCharBoundary(ReadOnlySpan<char> inputSpan, int index) =>
            (uint)index >= (uint)inputSpan.Length || !RegexCharClass.IsBoundaryWordChar(inputSpan[index]);

        /// <summary>Called to determine a char's inclusion in the \w set.</summary>
        internal static bool IsWordChar(char ch) => RegexCharClass.IsWordChar(ch);

        /// <summary>Determines whether the position at the specified index is an ECMA-compatible word boundary.</summary>
        protected bool IsECMABoundary(int index, int startpos, int endpos)
        {
            return (index > startpos && RegexCharClass.IsECMAWordChar(runtext![index - 1])) !=
                   (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index]));
        }

        internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)
        {
            int indexM1 = index - 1;
            return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[indexM1])) !=
                   ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index]));
        }

        /// <summary>Determines whether the specified character is in the given character set and category.</summary>
        [Obsolete(Obsoletions.RegexExtensibilityImplMessage, DiagnosticId = Obsoletions.RegexExtensibilityDiagId, UrlFormat = Obsoletions.SharedUrlFormat)]
        protected static bool CharInSet(char ch, string set, string category)
        {
            string charClass = RegexCharClass.ConvertOldStringsToClass(set, category);
            return RegexCharClass.CharInClass(ch, charClass);
        }

        /// <summary>Determines whether the specified character matches the given character class string.</summary>
        public static bool CharInClass(char ch, string charClass)
        {
            return RegexCharClass.CharInClass(ch, charClass);
        }

        /// <summary>
        /// Called by the implementation of Go() to increase the size of the
        /// backtracking stack.
        /// </summary>
        protected void DoubleTrack()
        {
            int[] newtrack = new int[runtrack!.Length * 2];

            Array.Copy(runtrack, 0, newtrack, runtrack.Length, runtrack.Length);
            runtrackpos += runtrack.Length;
            runtrack = newtrack;
        }

        /// <summary>
        /// Called by the implementation of Go() to increase the size of the
        /// grouping stack.
        /// </summary>
        protected void DoubleStack()
        {
            int[] newstack = new int[runstack!.Length * 2];

            Array.Copy(runstack, 0, newstack, runstack.Length, runstack.Length);
            runstackpos += runstack.Length;
            runstack = newstack;
        }

        /// <summary>
        /// Increases the size of the longjump unrolling stack.
        /// </summary>
        protected void DoubleCrawl()
        {
            int[] newcrawl = new int[runcrawl!.Length * 2];

            Array.Copy(runcrawl, 0, newcrawl, runcrawl.Length, runcrawl.Length);
            runcrawlpos += runcrawl.Length;
            runcrawl = newcrawl;
        }

        /// <summary>
        /// Save a number on the longjump unrolling stack
        /// </summary>
        protected void Crawl(int i)
        {
            if (runcrawlpos == 0)
                DoubleCrawl();

            runcrawl![--runcrawlpos] = i;
        }

        /// <summary>
        /// Remove a number from the longjump unrolling stack
        /// </summary>
        protected int Popcrawl()
        {
            return runcrawl![runcrawlpos++];
        }

        /// <summary>
        /// Get the height of the stack
        /// </summary>
        protected int Crawlpos()
        {
            return runcrawl!.Length - runcrawlpos;
        }

        /// <summary>
        /// Called by Go() to capture a subexpression. Note that the
        /// capnum used here has already been mapped to a non-sparse
        /// index (by the code generator RegexWriter).
        /// </summary>
        protected void Capture(int capnum, int start, int end)
        {
            if (end < start)
            {
                int t = end;
                end = start;
                start = t;
            }

            Crawl(capnum);
            runmatch!.AddMatch(capnum, start, end - start);
        }

        /// <summary>
        /// Called by Go() to capture a subexpression. Note that the
        /// capnum used here has already been mapped to a non-sparse
        /// index (by the code generator RegexWriter).
        /// </summary>
        protected void TransferCapture(int capnum, int uncapnum, int start, int end)
        {
            // these are the two intervals that are canceling each other

            if (end < start)
            {
                int t = end;
                end = start;
                start = t;
            }

            int start2 = MatchIndex(uncapnum);
            int end2 = start2 + MatchLength(uncapnum);

            // The new capture gets the innermost defined interval

            if (start >= end2)
            {
                end = start;
                start = end2;
            }
            else if (end <= start2)
            {
                start = start2;

                // Ensure we don't create a capture with negative length
                // When the balancing capture precedes the balanced group, end might be less than the new start
                if (end < start)
                {
                    end = start;
                }
            }
            else
            {
                if (end > end2)
                    end = end2;
                if (start2 > start)
                    start = start2;
            }

            Crawl(uncapnum);
            runmatch!.BalanceMatch(uncapnum);

            if (capnum != -1)
            {
                Crawl(capnum);
                runmatch.AddMatch(capnum, start, end - start);
            }
        }

        /// <summary>Called by <see cref="Go"/> to revert the last capture.</summary>
        protected void Uncapture()
        {
            int capnum = Popcrawl();
            runmatch!.RemoveMatch(capnum);
        }

        /// <summary>
        /// Call out to runmatch to get around visibility issues
        /// </summary>
        protected bool IsMatched(int cap)
        {
            return runmatch!.IsMatched(cap);
        }

        /// <summary>
        /// Call out to runmatch to get around visibility issues
        /// </summary>
        protected int MatchIndex(int cap)
        {
            return runmatch!.MatchIndex(cap);
        }

        /// <summary>
        /// Call out to runmatch to get around visibility issues
        /// </summary>
        protected int MatchLength(int cap)
        {
            return runmatch!.MatchLength(cap);
        }
    }
}
File: System\Text\RegularExpressions\RegexRunner.cs	Web Access
Project: src\runtime\src\libraries\System.Text.RegularExpressions\src\System.Text.RegularExpressions.csproj (System.Text.RegularExpressions)