AhoCorasick.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
 
namespace System.Buffers
{
    /// <summary>
    /// An implementation of the Aho-Corasick algorithm we use as a fallback when we can't use Teddy
    /// (either due to missing hardware intrinsics, or due to characteristics of the values used).
    /// https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    /// Works in O(n).
    /// </summary>
    internal readonly struct AhoCorasick
    {
        private readonly AhoCorasickNode[] _nodes;
        private readonly IndexOfAnyAsciiSearcher.AsciiState _startingAsciiChars;
 
        public AhoCorasick(AhoCorasickNode[] nodes, IndexOfAnyAsciiSearcher.AsciiState startingAsciiChars)
        {
            _nodes = nodes;
            _startingAsciiChars = startingAsciiChars;
        }
 
        public readonly bool ShouldUseAsciiFastScan
        {
            get
            {
                if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && _startingAsciiChars.Bitmap != default)
                {
                    // If there are a lot of starting characters such that we often find one early,
                    // the ASCII fast scan may end up performing worse than checking one character at a time.
                    // Avoid using this optimization if the combined frequency of starting chars is too high.
                    //
                    // For reference, the combined frequency of characters based on CharacterFrequencyHelper.AsciiFrequency:
                    // - All digits is ~ 5 %
                    // - All lowercase letters is ~ 57.2 %
                    // - All uppercase letters is ~ 7.4 %
                    //
                    // This limit is based on experimentation with different texts and sets of values.
                    // Above ~50 %, the cost of calling into the vectorized helper is higher than checking char by char on average.
                    const float MaxCombinedFrequency = 50f;
 
                    float frequency = 0;
 
                    for (int i = 0; i < 128; i++)
                    {
                        if (_startingAsciiChars.Lookup.Contains256((char)i))
                        {
                            frequency += CharacterFrequencyHelper.AsciiFrequency[i];
                        }
                    }
 
                    return frequency <= MaxCombinedFrequency;
                }
 
                return false;
            }
        }
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public readonly int IndexOfAny<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
            where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
            where TFastScanVariant : struct, IFastScan
        {
            return typeof(TCaseSensitivity) == typeof(StringSearchValuesHelper.CaseInsensitiveUnicode)
                ? IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(span)
                : IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(span);
        }
 
        private readonly int IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
            where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
            where TFastScanVariant : struct, IFastScan
        {
            Debug.Assert(typeof(TCaseSensitivity) != typeof(StringSearchValuesHelper.CaseInsensitiveUnicode));
 
            ref AhoCorasickNode nodes = ref MemoryMarshal.GetArrayDataReference(_nodes);
            int nodeIndex = 0;
            int result = -1;
            int i = 0;
 
        FastScan:
            Debug.Assert(nodeIndex == 0);
            // We are currently in the root node and trying to find the next position of any starting character.
            // If all the values start with an ASCII character, use a vectorized helper to quickly skip over characters that can't start a match.
            if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
            {
                int remainingLength = span.Length - i;
 
                if (remainingLength >= Vector128<ushort>.Count)
                {
                    // If '\0' is one of the starting chars and we're running on Ssse3 hardware, this may return false-positives.
                    // False-positives here are okay, we'll just rule them out below. While we could flow the Ssse3AndWasmHandleZeroInNeedle
                    // generic through, we expect such values to be rare enough that introducing more code is not worth it.
                    int offset = IndexOfAnyAsciiSearcher.IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default, SearchValues.FalseConst>(
                        ref Unsafe.As<char, short>(ref Unsafe.Add(ref MemoryMarshal.GetReference(span), i)),
                        remainingLength,
                        ref Unsafe.AsRef(in _startingAsciiChars));
 
                    if (offset < 0)
                    {
                        goto Return;
                    }
 
                    i += offset;
                    goto LoopWithoutRangeCheck;
                }
            }
 
        Loop:
            if ((uint)i >= (uint)span.Length)
            {
                goto Return;
            }
 
        LoopWithoutRangeCheck:
            // Read the next input character and either find the next potential match prefix or transition back to the root node.
            Debug.Assert((uint)i < (uint)span.Length);
            char c = TCaseSensitivity.TransformInput(Unsafe.Add(ref MemoryMarshal.GetReference(span), i));
 
            while (true)
            {
                Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
                ref AhoCorasickNode node = ref Unsafe.Add(ref nodes, (uint)nodeIndex);
 
                if (node.TryGetChild(c, out int childIndex))
                {
                    // We were able to extend the current match. If this node contains a potential match, remember that.
                    nodeIndex = childIndex;
 
                    Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
                    int matchLength = Unsafe.Add(ref nodes, (uint)nodeIndex).MatchLength;
                    if (matchLength != 0)
                    {
                        // Any result we find from here on out may only be lower (longer match with a start closer to the beginning of the input).
                        Debug.Assert(result == -1 || result >= i + 1 - matchLength);
                        result = i + 1 - matchLength;
                    }
 
                    i++;
                    goto Loop;
                }
 
                if (nodeIndex == 0)
                {
                    // We are back at the root node and none of the values start with the current character.
                    if (result >= 0)
                    {
                        // If we've already found a match, we can't find an earlier one anymore. This is the result
                        goto Return;
                    }
 
                    // Go back to searching for the next possible starting character.
                    i++;
                    goto FastScan;
                }
 
                // Follow the next suffix link.
                nodeIndex = node.SuffixLink;
 
                if (nodeIndex < 0)
                {
                    // A node with a suffix link of -1 indicates a match, see AhoCorasickBuilder.AddSuffixLinks.
                    Debug.Assert(nodeIndex == -1);
                    Debug.Assert(result >= 0);
                    goto Return;
                }
 
                // Try to match the current character again at the suffix link node.
            }
 
        Return:
            return result;
        }
 
        // Mostly a copy of IndexOfAnyCore, but we may read two characters at a time in the case of surrogate pairs.
        private readonly int IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(ReadOnlySpan<char> span)
            where TFastScanVariant : struct, IFastScan
        {
            const char LowSurrogateNotSet = '\0';
 
            ref AhoCorasickNode nodes = ref MemoryMarshal.GetArrayDataReference(_nodes);
            int nodeIndex = 0;
            int result = -1;
            int i = 0;
            char lowSurrogateUpper = LowSurrogateNotSet;
 
        FastScan:
            // We are currently in the root node and trying to find the next position of any starting character.
            // If all the values start with an ASCII character, use a vectorized helper to quickly skip over characters that can't start a match.
            if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
            {
                if (lowSurrogateUpper != LowSurrogateNotSet)
                {
                    // We read a surrogate pair in the previous loop iteration and processed the high surrogate.
                    // Continue with the stored low surrogate.
                    goto LoopWithoutRangeCheck;
                }
 
                int remainingLength = span.Length - i;
 
                if (remainingLength >= Vector128<ushort>.Count)
                {
                    int offset = IndexOfAnyAsciiSearcher.IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default, SearchValues.FalseConst>(
                        ref Unsafe.As<char, short>(ref Unsafe.Add(ref MemoryMarshal.GetReference(span), i)),
                        remainingLength,
                        ref Unsafe.AsRef(in _startingAsciiChars));
 
                    if (offset < 0)
                    {
                        goto Return;
                    }
 
                    i += offset;
                    goto LoopWithoutRangeCheck;
                }
            }
 
        Loop:
            if ((uint)i >= (uint)span.Length)
            {
                goto Return;
            }
 
        LoopWithoutRangeCheck:
            // Read the next input character and either find the next potential match prefix or transition back to the root node.
            Debug.Assert((uint)i < (uint)span.Length);
            char c;
            if (lowSurrogateUpper != LowSurrogateNotSet)
            {
                // We have just processed the high surrogate. Continue with the low surrogate we read in the previous iteration.
                c = lowSurrogateUpper;
                lowSurrogateUpper = LowSurrogateNotSet;
            }
            else
            {
                // Read the next character, check if it's a high surrogate, and transform it to its Ordinal uppercase representation.
                c = Unsafe.Add(ref MemoryMarshal.GetReference(span), i);
                char lowSurrogate;
 
                if (char.IsHighSurrogate(c) &&
                    (uint)(i + 1) < (uint)span.Length &&
                    char.IsLowSurrogate(lowSurrogate = Unsafe.Add(ref MemoryMarshal.GetReference(span), i + 1)))
                {
                    if (GlobalizationMode.UseNls)
                    {
                        SurrogateToUpperNLS(c, lowSurrogate, out c, out lowSurrogateUpper);
                    }
                    else
                    {
                        SurrogateCasing.ToUpper(c, lowSurrogate, out c, out lowSurrogateUpper);
                    }
 
                    Debug.Assert(lowSurrogateUpper != LowSurrogateNotSet);
                }
                else
                {
                    c = TextInfo.ToUpperOrdinal(c);
                }
 
#if DEBUG
                // The above logic must match Ordinal.ToUpperOrdinal exactly.
                Span<char> destination = new char[2]; // Avoid stackalloc in a loop
                Ordinal.ToUpperOrdinal(span.Slice(i, i + 1 == span.Length ? 1 : 2), destination);
                Debug.Assert(c == destination[0]);
                Debug.Assert(lowSurrogateUpper == LowSurrogateNotSet || lowSurrogateUpper == destination[1]);
#endif
            }
 
            while (true)
            {
                Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
                ref AhoCorasickNode node = ref Unsafe.Add(ref nodes, (uint)nodeIndex);
 
                if (node.TryGetChild(c, out int childIndex))
                {
                    // We were able to extend the current match. If this node contains a potential match, remember that.
                    nodeIndex = childIndex;
 
                    Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
                    int matchLength = Unsafe.Add(ref nodes, (uint)nodeIndex).MatchLength;
                    if (matchLength != 0)
                    {
                        // Any result we find from here on out may only be lower (longer match with a start closer to the beginning of the input).
                        Debug.Assert(result == -1 || result >= i + 1 - matchLength);
                        result = i + 1 - matchLength;
                    }
 
                    i++;
                    goto Loop;
                }
 
                if (nodeIndex == 0)
                {
                    // We are back at the root node and none of the values start with the current character.
                    if (result >= 0)
                    {
                        // If we've already found a match, we can't find an earlier one anymore. This is the result
                        goto Return;
                    }
 
                    // Go back to searching for the next possible starting character.
                    i++;
                    goto FastScan;
                }
 
                // Follow the next suffix link.
                nodeIndex = node.SuffixLink;
 
                if (nodeIndex < 0)
                {
                    // A node with a suffix link of -1 indicates a match, see AhoCorasickBuilder.AddSuffixLinks.
                    Debug.Assert(nodeIndex == -1);
                    Debug.Assert(result >= 0);
                    goto Return;
                }
 
                // Try to match the current character again at the suffix link node.
            }
 
        Return:
            return result;
        }
 
        private static void SurrogateToUpperNLS(char h, char l, out char hr, out char lr)
        {
            Debug.Assert(char.IsHighSurrogate(h));
            Debug.Assert(char.IsLowSurrogate(l));
 
            ReadOnlySpan<char> chars = [h, l];
            Span<char> destination = stackalloc char[2];
 
            int written = Ordinal.ToUpperOrdinal(chars, destination);
            Debug.Assert(written == 2);
 
            hr = destination[0];
            lr = destination[1];
 
            Debug.Assert(char.IsHighSurrogate(hr));
            Debug.Assert(char.IsLowSurrogate(lr));
        }
 
        public interface IFastScan { }
 
        public readonly struct IndexOfAnyAsciiFastScan : IFastScan { }
 
        public readonly struct NoFastScan : IFastScan { }
    }
}
File: src\libraries\System.Private.CoreLib\src\System\SearchValues\Strings\Helpers\AhoCorasick.cs	Web Access
Project: src\src\coreclr\System.Private.CoreLib\System.Private.CoreLib.csproj (System.Private.CoreLib)