File: src\libraries\System.Private.CoreLib\src\System\SpanHelpers.Char.cs
Web Access
Project: src\src\coreclr\System.Private.CoreLib\System.Private.CoreLib.csproj (System.Private.CoreLib)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
 
namespace System
{
    internal static partial class SpanHelpers // .Char
    {
        public static int IndexOf(ref char searchSpace, int searchSpaceLength, ref char value, int valueLength)
        {
            Debug.Assert(searchSpaceLength >= 0);
            Debug.Assert(valueLength >= 0);
 
            if (valueLength == 0)
                return 0;  // A zero-length sequence is always treated as "found" at the start of the search space.
 
            int valueTailLength = valueLength - 1;
            if (valueTailLength == 0)
            {
                // for single-char values use plain IndexOf
                return IndexOfChar(ref searchSpace, value, searchSpaceLength);
            }
 
            nint offset = 0;
            char valueHead = value;
            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<ushort>.Count)
            {
                goto SEARCH_TWO_CHARS;
            }
 
            ref byte valueTail = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref value, 1));
            int remainingSearchSpaceLength = searchSpaceMinusValueTailLength;
 
            while (remainingSearchSpaceLength > 0)
            {
                // Do a quick search for the first element of "value".
                // Using the non-packed variant as the input is short and would not benefit from the packed implementation.
                int relativeIndex = NonPackedIndexOfChar(ref Unsafe.Add(ref searchSpace, offset), valueHead, remainingSearchSpaceLength);
                if (relativeIndex < 0)
                    break;
 
                remainingSearchSpaceLength -= relativeIndex;
                offset += relativeIndex;
 
                if (remainingSearchSpaceLength <= 0)
                    break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
 
                // Found the first element of "value". See if the tail matches.
                if (SequenceEqual(
                        ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + 1)),
                        ref valueTail,
                        (nuint)(uint)valueTailLength * 2))
                {
                    return (int)offset;  // The tail matched. Return a successful find.
                }
 
                remainingSearchSpaceLength--;
                offset++;
            }
            return -1;
 
            // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
            // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
        SEARCH_TWO_CHARS:
            if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<ushort>.Count >= 0)
            {
                // Find the last unique (which is not equal to ch1) character
                // the algorithm is fine if both are equal, just a little bit less efficient
                ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
                nint ch1ch2Distance = (nint)(uint)valueTailLength;
                while (ch2Val == valueHead && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector512<ushort> ch1 = Vector512.Create((ushort)valueHead);
                Vector512<ushort> ch2 = Vector512.Create(ch2Val);
 
                nint searchSpaceMinusValueTailLengthAndVector =
                    searchSpaceMinusValueTailLength - (nint)Vector512<ushort>.Count;
 
                do
                {
                    // Make sure we don't go out of bounds
                    Debug.Assert(offset + ch1ch2Distance + Vector512<ushort>.Count <= searchSpaceLength);
 
                    Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    if (cmpAnd != Vector512<byte>.Zero)
                    {
                        goto CANDIDATE_FOUND;
                    }
 
                LOOP_FOOTER:
                    offset += Vector512<ushort>.Count;
 
                    if (offset == searchSpaceMinusValueTailLength)
                        return -1;
 
                    // Overlap with the current chunk for trailing elements
                    if (offset > searchSpaceMinusValueTailLengthAndVector)
                        offset = searchSpaceMinusValueTailLengthAndVector;
 
                    continue;
 
                CANDIDATE_FOUND:
                    ulong mask = cmpAnd.ExtractMostSignificantBits();
                    do
                    {
                        int bitPos = BitOperations.TrailingZeroCount(mask);
                        // div by 2 (shr) because we work with 2-byte chars
                        nint charPos = (nint)((uint)bitPos / 2);
                        if (valueLength == 2 || // we already matched two chars
                            SequenceEqual(
                                ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                        {
                            return (int)(offset + charPos);
                        }
 
                        // Clear two the lowest set bits
                        if (Bmi1.X64.IsSupported)
                            mask = Bmi1.X64.ResetLowestSetBit(Bmi1.X64.ResetLowestSetBit(mask));
                        else
                            mask &= ~(ulong)((ulong)0b11 << bitPos);
                    } while (mask != 0);
                    goto LOOP_FOOTER;
 
                } while (true);
            }
            else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
            {
                // Find the last unique (which is not equal to ch1) character
                // the algorithm is fine if both are equal, just a little bit less efficient
                ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
                nint ch1ch2Distance = valueTailLength;
                while (ch2Val == valueHead && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector256<ushort> ch1 = Vector256.Create((ushort)valueHead);
                Vector256<ushort> ch2 = Vector256.Create(ch2Val);
 
                nint searchSpaceMinusValueTailLengthAndVector =
                    searchSpaceMinusValueTailLength - (nint)Vector256<ushort>.Count;
 
                do
                {
                    // Make sure we don't go out of bounds
                    Debug.Assert(offset + ch1ch2Distance + Vector256<ushort>.Count <= searchSpaceLength);
 
                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    if (cmpAnd != Vector256<byte>.Zero)
                    {
                        goto CANDIDATE_FOUND;
                    }
 
                LOOP_FOOTER:
                    offset += Vector256<ushort>.Count;
 
                    if (offset == searchSpaceMinusValueTailLength)
                        return -1;
 
                    // Overlap with the current chunk for trailing elements
                    if (offset > searchSpaceMinusValueTailLengthAndVector)
                        offset = searchSpaceMinusValueTailLengthAndVector;
 
                    continue;
 
                CANDIDATE_FOUND:
                    uint mask = cmpAnd.ExtractMostSignificantBits();
                    do
                    {
                        nint charPos = (nint)(uint.TrailingZeroCount(mask) / sizeof(ushort));
                        if (valueLength == 2 || // we already matched two chars
                            SequenceEqual(
                                ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                        {
                            return (int)(offset + charPos);
                        }
 
                        // Clear two the lowest set bits
                        mask = BitOperations.ResetLowestSetBit(BitOperations.ResetLowestSetBit(mask));
                    } while (mask != 0);
                    goto LOOP_FOOTER;
 
                } while (true);
            }
            else // 128bit vector path (SSE2 or AdvSimd)
            {
                // Find the last unique (which is not equal to ch1) character
                // the algorithm is fine if both are equal, just a little bit less efficient
                ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
                nint ch1ch2Distance = valueTailLength;
                while (ch2Val == valueHead && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector128<ushort> ch1 = Vector128.Create((ushort)valueHead);
                Vector128<ushort> ch2 = Vector128.Create(ch2Val);
 
                nint searchSpaceMinusValueTailLengthAndVector =
                    searchSpaceMinusValueTailLength - (nint)Vector128<ushort>.Count;
 
                do
                {
                    // Make sure we don't go out of bounds
                    Debug.Assert(offset + ch1ch2Distance + Vector128<ushort>.Count <= searchSpaceLength);
 
                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    if (cmpAnd != Vector128<byte>.Zero)
                    {
                        goto CANDIDATE_FOUND;
                    }
 
                LOOP_FOOTER:
                    offset += Vector128<ushort>.Count;
 
                    if (offset == searchSpaceMinusValueTailLength)
                        return -1;
 
                    // Overlap with the current chunk for trailing elements
                    if (offset > searchSpaceMinusValueTailLengthAndVector)
                        offset = searchSpaceMinusValueTailLengthAndVector;
 
                    continue;
 
                CANDIDATE_FOUND:
                    uint mask = cmpAnd.ExtractMostSignificantBits();
                    do
                    {
                        nint charPos = (nint)(uint.TrailingZeroCount(mask) / sizeof(ushort));
                        if (valueLength == 2 || // we already matched two chars
                            SequenceEqual(
                                ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                        {
                            return (int)(offset + charPos);
                        }
 
                        // Clear two lowest set bits
                        mask = BitOperations.ResetLowestSetBit(BitOperations.ResetLowestSetBit(mask));
                    } while (mask != 0);
                    goto LOOP_FOOTER;
 
                } while (true);
            }
        }
 
        public static int LastIndexOf(ref char searchSpace, int searchSpaceLength, ref char value, int valueLength)
        {
            Debug.Assert(searchSpaceLength >= 0);
            Debug.Assert(valueLength >= 0);
 
            if (valueLength == 0)
                return searchSpaceLength;  // A zero-length sequence is always treated as "found" at the end of the search space.
 
            int valueTailLength = valueLength - 1;
            if (valueTailLength == 0)
                return LastIndexOfValueType(ref Unsafe.As<char, short>(ref searchSpace), (short)value, searchSpaceLength); // for single-char values use plain LastIndexOf
 
            int offset = 0;
            char valueHead = value;
            int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
            if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<ushort>.Count)
            {
                goto SEARCH_TWO_CHARS;
            }
 
            ref byte valueTail = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref value, 1));
 
            while (true)
            {
                Debug.Assert(0 <= offset && offset <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
                int remainingSearchSpaceLength = searchSpaceLength - offset - valueTailLength;
                if (remainingSearchSpaceLength <= 0)
                    break;  // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
 
                // Do a quick search for the first element of "value".
                int relativeIndex = LastIndexOfValueType(ref Unsafe.As<char, short>(ref searchSpace), (short)valueHead, remainingSearchSpaceLength);
                if (relativeIndex == -1)
                    break;
 
                // Found the first element of "value". See if the tail matches.
                if (SequenceEqual(
                        ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)),
                        ref valueTail, (nuint)(uint)valueTailLength * 2))
                {
                    return relativeIndex; // The tail matched. Return a successful find.
                }
 
                offset += remainingSearchSpaceLength - relativeIndex;
            }
            return -1;
 
            // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
            // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
        SEARCH_TWO_CHARS:
            if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512<ushort>.Count)
            {
                offset = searchSpaceMinusValueTailLength - Vector512<ushort>.Count;
 
                // Find the last unique (which is not equal to ch1) char
                // the algorithm is fine if both are equal, just a little bit less efficient
                char ch2Val = Unsafe.Add(ref value, valueTailLength);
                int ch1ch2Distance = valueTailLength;
                while (ch2Val == valueHead && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector512<ushort> ch1 = Vector512.Create((ushort)valueHead);
                Vector512<ushort> ch2 = Vector512.Create((ushort)ch2Val);
 
                do
                {
 
                    Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    if (cmpAnd != Vector512<byte>.Zero)
                    {
                        ulong mask = cmpAnd.ExtractMostSignificantBits();
                        do
                        {
                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
                            int bitPos = 62 - BitOperations.LeadingZeroCount(mask);
                            int charPos = (int)((uint)bitPos / 2);
 
                            if (valueLength == 2 || // we already matched two chars
                                SequenceEqual(
                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                            {
                                return charPos + offset;
                            }
                            mask &= ~(ulong)((ulong)0b11 << bitPos); // clear two highest set bits.
                        } while (mask != 0);
                    }
 
                    offset -= Vector512<ushort>.Count;
                    if (offset == -Vector512<ushort>.Count)
                        return -1;
                    // Overlap with the current chunk if there is not enough room for the next one
                    if (offset < 0)
                        offset = 0;
                } while (true);
            }
            else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
            {
                offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
 
                // Find the last unique (which is not equal to ch1) char
                // the algorithm is fine if both are equal, just a little bit less efficient
                char ch2Val = Unsafe.Add(ref value, valueTailLength);
                int ch1ch2Distance = valueTailLength;
                while (ch2Val == valueHead && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector256<ushort> ch1 = Vector256.Create((ushort)valueHead);
                Vector256<ushort> ch2 = Vector256.Create((ushort)ch2Val);
 
                do
                {
 
                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    if (cmpAnd != Vector256<byte>.Zero)
                    {
                        uint mask = cmpAnd.ExtractMostSignificantBits();
                        do
                        {
                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
                            int bitPos = 30 - BitOperations.LeadingZeroCount(mask);
                            int charPos = (int)((uint)bitPos / 2);
 
                            if (valueLength == 2 || // we already matched two chars
                                SequenceEqual(
                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                            {
                                return charPos + offset;
                            }
                            mask &= ~(uint)(0b11 << bitPos); // clear two highest set bits.
                        } while (mask != 0);
                    }
 
                    offset -= Vector256<ushort>.Count;
                    if (offset == -Vector256<ushort>.Count)
                        return -1;
                    // Overlap with the current chunk if there is not enough room for the next one
                    if (offset < 0)
                        offset = 0;
                } while (true);
            }
            else // 128bit vector path (SSE2 or AdvSimd)
            {
                offset = searchSpaceMinusValueTailLength - Vector128<ushort>.Count;
 
                // Find the last unique (which is not equal to ch1) char
                // the algorithm is fine if both are equal, just a little bit less efficient
                char ch2Val = Unsafe.Add(ref value, valueTailLength);
                int ch1ch2Distance = valueTailLength;
                while (ch2Val == value && ch1ch2Distance > 1)
                    ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
 
                Vector128<ushort> ch1 = Vector128.Create((ushort)value);
                Vector128<ushort> ch2 = Vector128.Create((ushort)ch2Val);
 
                do
                {
                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                    Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                    // Early out: cmpAnd is all zeros
                    // it's especially important for ARM where ExtractMostSignificantBits is not cheap
                    if (cmpAnd != Vector128<byte>.Zero)
                    {
                        uint mask = cmpAnd.ExtractMostSignificantBits();
                        do
                        {
                            // unlike IndexOf, here we use LZCNT to process matches starting from the end
                            int bitPos = 30 - BitOperations.LeadingZeroCount(mask);
                            int charPos = (int)((uint)bitPos / 2);
 
                            if (valueLength == 2 || // we already matched two chars
                                SequenceEqual(
                                    ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
                                    ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
                            {
                                return charPos + offset;
                            }
                            mask &= ~(uint)(0b11 << bitPos); // clear two the highest set bits.
                        } while (mask != 0);
                    }
 
                    offset -= Vector128<ushort>.Count;
                    if (offset == -Vector128<ushort>.Count)
                        return -1;
                    // Overlap with the current chunk if there is not enough room for the next one
                    if (offset < 0)
                        offset = 0;
                } while (true);
            }
        }
 
        public static unsafe int SequenceCompareTo(ref char first, int firstLength, ref char second, int secondLength)
        {
            Debug.Assert(firstLength >= 0);
            Debug.Assert(secondLength >= 0);
 
            int lengthDelta = firstLength - secondLength;
 
            if (Unsafe.AreSame(ref first, ref second))
                goto Equal;
 
            nuint minLength = (nuint)(((uint)firstLength < (uint)secondLength) ? (uint)firstLength : (uint)secondLength);
            nuint i = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
 
            if (minLength >= (nuint)(sizeof(nuint) / sizeof(char)))
            {
                if (Vector.IsHardwareAccelerated && minLength >= (nuint)Vector<ushort>.Count)
                {
                    nuint nLength = minLength - (nuint)Vector<ushort>.Count;
                    do
                    {
                        if (Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref first, (nint)i))) !=
                            Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref second, (nint)i))))
                        {
                            break;
                        }
                        i += (nuint)Vector<ushort>.Count;
                    }
                    while (nLength >= i);
                }
 
                while (minLength >= (i + (nuint)(sizeof(nuint) / sizeof(char))))
                {
                    if (Unsafe.ReadUnaligned<nuint>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref first, (nint)i))) !=
                        Unsafe.ReadUnaligned<nuint>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref second, (nint)i))))
                    {
                        break;
                    }
                    i += (nuint)(sizeof(nuint) / sizeof(char));
                }
            }
 
#if TARGET_64BIT
            if (minLength >= (i + sizeof(int) / sizeof(char)))
            {
                if (Unsafe.ReadUnaligned<int>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref first, (nint)i))) ==
                    Unsafe.ReadUnaligned<int>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref second, (nint)i))))
                {
                    i += sizeof(int) / sizeof(char);
                }
            }
#endif
 
            while (i < minLength)
            {
                int result = Unsafe.Add(ref first, (nint)i).CompareTo(Unsafe.Add(ref second, (nint)i));
                if (result != 0)
                    return result;
                i += 1;
            }
 
        Equal:
            return lengthDelta;
        }
 
        // IndexOfNullCharacter processes memory in aligned chunks, and thus it won't crash even if it accesses memory beyond the null terminator.
        // This behavior is an implementation detail of the runtime and callers outside System.Private.CoreLib must not depend on it.
        public static unsafe int IndexOfNullCharacter(char* searchSpace)
        {
            const char value = '\0';
            const int length = int.MaxValue;
 
            nint offset = 0;
            nint lengthToExamine = length;
 
            if (((int)searchSpace & 1) != 0)
            {
                // Input isn't char aligned, we won't be able to align it to a Vector
            }
            else if (Vector128.IsHardwareAccelerated)
            {
                // Avx2 branch also operates on Sse2 sizes, so check is combined.
                // Needs to be double length to allow us to align the data first.
                lengthToExamine = UnalignedCountVector128(searchSpace);
            }
 
        SequentialScan:
            // In the non-vector case lengthToExamine is the total length.
            // In the vector case lengthToExamine first aligns to Vector,
            // then in a second pass after the Vector lengths is the
            // remaining data that is shorter than a Vector length.
            while (lengthToExamine >= 4)
            {
                if (value == searchSpace[offset])
                    goto Found;
                if (value == searchSpace[offset + 1])
                    goto Found1;
                if (value == searchSpace[offset + 2])
                    goto Found2;
                if (value == searchSpace[offset + 3])
                    goto Found3;
 
                offset += 4;
                lengthToExamine -= 4;
            }
 
            while (lengthToExamine > 0)
            {
                if (value == searchSpace[offset])
                    goto Found;
 
                offset++;
                lengthToExamine--;
            }
 
            // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
            // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
            if (Vector512.IsHardwareAccelerated)
            {
                if (offset < length)
                {
                    Debug.Assert(length - offset >= Vector128<ushort>.Count);
                    if (((nint)(searchSpace + (nint)offset) & (nint)(Vector256<byte>.Count - 1)) != 0)
                    {
                        // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
                        // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256,
                        // before moving to processing Vector256.
 
                        // This ensures we do not fault across memory pages
                        // while searching for an end of string. Specifically that this assumes that the length is either correct
                        // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
                        // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
                        // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
                        // than ever causing an AV.
                        Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as below
                        uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector128<ushort>.Count;
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
                    if (((nint)(searchSpace + (nint)offset) & (nint)(Vector512<byte>.Count - 1)) != 0)
                    {
                        // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches
                        // with no upper bound e.g. String.wcslen. Start with a check on Vector256 to align to Vector512,
                        // before moving to processing Vector256.
 
                        // This ensures we do not fault across memory pages
                        // while searching for an end of string. Specifically that this assumes that the length is either correct
                        // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
                        // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
                        // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
                        // than ever causing an AV.
                        Vector256<ushort> search = *(Vector256<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as below
                        uint matches = Vector256.Equals(Vector256<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector256<ushort>.Count;
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
 
                    lengthToExamine = GetCharVector512SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        do
                        {
                            Debug.Assert(lengthToExamine >= Vector512<ushort>.Count);
 
                            Vector512<ushort> search = *(Vector512<ushort>*)(searchSpace + (nuint)offset);
 
                            // AVX-512 returns comparison results in a mask register, so we want to optimize
                            // the core check to simply be an "none match" check. This will slightly increase
                            // the cost for the early match case, but greatly improves perf otherwise.
 
                            if (!Vector512.EqualsAny(search, Vector512<ushort>.Zero))
                            {
                                // Zero flags set so no matches
                                offset += Vector512<ushort>.Count;
                                lengthToExamine -= Vector512<ushort>.Count;
                                continue;
                            }
 
                            // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags,
                            // So the bit position in 'matches' corresponds to the element offset.
                            //
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            ulong matches = Vector512.Equals(search, Vector512<ushort>.Zero).ExtractMostSignificantBits();
                            return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                        } while (lengthToExamine > 0);
                    }
 
                    lengthToExamine = GetCharVector256SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);
 
                        Vector256<ushort> search = *(Vector256<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as above
                        uint matches = Vector256.Equals(Vector256<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector256<ushort>.Count;
                            // Don't need to change lengthToExamine here as we don't use its current value again.
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
 
                    lengthToExamine = GetCharVector128SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
 
                        Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as above
                        uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector128<ushort>.Count;
                            // Don't need to change lengthToExamine here as we don't use its current value again.
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
 
                    if (offset < length)
                    {
                        lengthToExamine = length - offset;
                        goto SequentialScan;
                    }
                }
            }
            else if (Vector256.IsHardwareAccelerated)
            {
                if (offset < length)
                {
                    Debug.Assert(length - offset >= Vector128<ushort>.Count);
                    if (((nint)(searchSpace + (nint)offset) & (nint)(Vector256<byte>.Count - 1)) != 0)
                    {
                        // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
                        // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256,
                        // before moving to processing Vector256.
 
                        // This ensures we do not fault across memory pages
                        // while searching for an end of string. Specifically that this assumes that the length is either correct
                        // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
                        // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
                        // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
                        // than ever causing an AV.
                        Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as below
                        uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector128<ushort>.Count;
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
 
                    lengthToExamine = GetCharVector256SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        do
                        {
                            Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);
 
                            Vector256<ushort> search = *(Vector256<ushort>*)(searchSpace + (nuint)offset);
                            uint matches = Vector256.Equals(Vector256<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                            // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                            // So the bit position in 'matches' corresponds to the element offset.
                            if (matches == 0)
                            {
                                // Zero flags set so no matches
                                offset += Vector256<ushort>.Count;
                                lengthToExamine -= Vector256<ushort>.Count;
                                continue;
                            }
 
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        } while (lengthToExamine > 0);
                    }
 
                    lengthToExamine = GetCharVector128SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
 
                        Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
 
                        // Same method as above
                        uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += Vector128<ushort>.Count;
                            // Don't need to change lengthToExamine here as we don't use its current value again.
                        }
                        else
                        {
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        }
                    }
 
                    if (offset < length)
                    {
                        lengthToExamine = length - offset;
                        goto SequentialScan;
                    }
                }
            }
            else if (Vector128.IsHardwareAccelerated)
            {
                if (offset < length)
                {
                    Debug.Assert(length - offset >= Vector128<ushort>.Count);
 
                    lengthToExamine = GetCharVector128SpanLength(offset, length);
                    if (lengthToExamine > 0)
                    {
                        do
                        {
                            Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
 
                            Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
 
                            // Same method as above
                            Vector128<ushort> compareResult = Vector128.Equals(Vector128<ushort>.Zero, search);
                            if (compareResult == Vector128<ushort>.Zero)
                            {
                                // Zero flags set so no matches
                                offset += Vector128<ushort>.Count;
                                lengthToExamine -= Vector128<ushort>.Count;
                                continue;
                            }
 
                            // Find bitflag offset of first match and add to current offset,
                            // flags are in bytes so divide for chars
                            uint matches = compareResult.AsByte().ExtractMostSignificantBits();
                            return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                        } while (lengthToExamine > 0);
                    }
 
                    if (offset < length)
                    {
                        lengthToExamine = length - offset;
                        goto SequentialScan;
                    }
                }
            }
 
            ThrowMustBeNullTerminatedString();
        Found3:
            return (int)(offset + 3);
        Found2:
            return (int)(offset + 2);
        Found1:
            return (int)(offset + 1);
        Found:
            return (int)(offset);
        }
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static int LocateFirstFoundChar(ulong match)
            => BitOperations.TrailingZeroCount(match) >> 4;
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static nint GetCharVector128SpanLength(nint offset, nint length)
            => (length - offset) & ~(Vector128<ushort>.Count - 1);
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static nint GetCharVector256SpanLength(nint offset, nint length)
            => (length - offset) & ~(Vector256<ushort>.Count - 1);
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static nint GetCharVector512SpanLength(nint offset, nint length)
            => (length - offset) & ~(Vector512<ushort>.Count - 1);
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe nint UnalignedCountVector128(char* searchSpace)
        {
            const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
            return (nint)(uint)(-(int)searchSpace / ElementsPerByte) & (Vector128<ushort>.Count - 1);
        }
 
        public static void Reverse(ref char buf, nuint length)
        {
            Debug.Assert(length > 1);
 
            nint remainder = (nint)length;
            nint offset = 0;
 
            if (Vector512.IsHardwareAccelerated && remainder >= Vector512<ushort>.Count * 2)
            {
                nint lastOffset = remainder - Vector512<ushort>.Count;
                do
                {
                    ref ushort first = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, offset));
                    ref ushort last = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, lastOffset));
 
                    Vector512<ushort> tempFirst = Vector512.LoadUnsafe(ref first);
                    Vector512<ushort> tempLast = Vector512.LoadUnsafe(ref last);
 
                    // Shuffle to reverse each vector:
                    //     +-------------------------------+
                    //     | A | B | C | D | E | F | G | H |
                    //     +-------------------------------+
                    //          --->
                    //     +-------------------------------+
                    //     | H | G | F | E | D | C | B | A |
                    //     +-------------------------------+
                    tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
                        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
                    tempLast = Vector512.Shuffle(tempLast, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
                        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
 
                    // Store the reversed vectors
                    tempLast.StoreUnsafe(ref first);
                    tempFirst.StoreUnsafe(ref last);
 
                    offset += Vector512<ushort>.Count;
                    lastOffset -= Vector512<ushort>.Count;
                } while (lastOffset >= offset);
 
                remainder = (lastOffset + Vector512<ushort>.Count - offset);
            }
            // overlapping has a positive performance benefit around 24 elements
            else if (Avx2.IsSupported && remainder >= (nint)(Vector256<ushort>.Count * 1.5))
            {
                Vector256<byte> reverseMask = Vector256.Create(
                    (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
                    14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // second 128-bit lane
 
                nint lastOffset = remainder - Vector256<ushort>.Count;
                do
                {
                    ref byte first = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, offset));
                    ref byte last = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, lastOffset));
 
                    Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref first);
                    Vector256<byte> tempLast = Vector256.LoadUnsafe(ref last);
 
                    // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
                    // Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
                    //     +---------------------------------------------------------------+
                    //     | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
                    //     +---------------------------------------------------------------+
                    //         Shuffle --->
                    //     +---------------------------------------------------------------+
                    //     | H | G | F | E | D | C | B | A | P | O | N | M | L | K | J | I |
                    //     +---------------------------------------------------------------+
                    //         Permute --->
                    //     +---------------------------------------------------------------+
                    //     | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
                    //     +---------------------------------------------------------------+
                    tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
                    tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
                    tempLast = Avx2.Shuffle(tempLast, reverseMask);
                    tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);
 
                    // Store the reversed vectors
                    tempLast.StoreUnsafe(ref first);
                    tempFirst.StoreUnsafe(ref last);
 
                    offset += Vector256<ushort>.Count;
                    lastOffset -= Vector256<ushort>.Count;
                } while (lastOffset >= offset);
 
                remainder = (lastOffset + Vector256<ushort>.Count - offset);
            }
            else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<ushort>.Count * 2)
            {
                nint lastOffset = remainder - Vector128<ushort>.Count;
                do
                {
                    ref ushort first = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, offset));
                    ref ushort last = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, lastOffset));
 
                    Vector128<ushort> tempFirst = Vector128.LoadUnsafe(ref first);
                    Vector128<ushort> tempLast = Vector128.LoadUnsafe(ref last);
 
                    // Shuffle to reverse each vector:
                    //     +-------------------------------+
                    //     | A | B | C | D | E | F | G | H |
                    //     +-------------------------------+
                    //          --->
                    //     +-------------------------------+
                    //     | H | G | F | E | D | C | B | A |
                    //     +-------------------------------+
                    tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));
                    tempLast = Vector128.Shuffle(tempLast, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));
 
                    // Store the reversed vectors
                    tempLast.StoreUnsafe(ref first);
                    tempFirst.StoreUnsafe(ref last);
 
                    offset += Vector128<ushort>.Count;
                    lastOffset -= Vector128<ushort>.Count;
                } while (lastOffset >= offset);
 
                remainder = (lastOffset + Vector128<ushort>.Count - offset);
            }
 
            // Store any remaining values one-by-one
            if (remainder > 1)
            {
                ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
            }
        }
    }
}