// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Buffers.Binary;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace System
{
internal static partial class SpanHelpers // .Byte
{
public static int IndexOf(ref byte searchSpace, int searchSpaceLength, ref byte value, int valueLength)
{
Debug.Assert(searchSpaceLength >= 0);
Debug.Assert(valueLength >= 0);
if (valueLength == 0)
return 0; // A zero-length sequence is always treated as "found" at the start of the search space.
int valueTailLength = valueLength - 1;
if (valueTailLength == 0)
return IndexOfValueType(ref searchSpace, value, searchSpaceLength); // for single-byte values use plain IndexOf
nint offset = 0;
byte valueHead = value;
int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<byte>.Count)
{
goto SEARCH_TWO_BYTES;
}
ref byte valueTail = ref Unsafe.Add(ref value, 1);
int remainingSearchSpaceLength = searchSpaceMinusValueTailLength;
while (remainingSearchSpaceLength > 0)
{
// Do a quick search for the first element of "value".
int relativeIndex = IndexOfValueType(ref Unsafe.Add(ref searchSpace, offset), valueHead, remainingSearchSpaceLength);
if (relativeIndex < 0)
break;
remainingSearchSpaceLength -= relativeIndex;
offset += relativeIndex;
if (remainingSearchSpaceLength <= 0)
break; // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
// Found the first element of "value". See if the tail matches.
if (SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + 1),
ref valueTail, (nuint)(uint)valueTailLength)) // The (nuint)-cast is necessary to pick the correct overload
return (int)offset; // The tail matched. Return a successful find.
remainingSearchSpaceLength--;
offset++;
}
return -1;
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
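// The SIMD search broadcasts two bytes of "value" into vectors: its first byte (ch1) and the last
// byte that differs from it (ch2), ch1ch2Distance bytes apart. Two chunks of the search space loaded
// at that same distance are compared against ch1 and ch2 and the results are ANDed, so only positions
// where both bytes match become candidates; each candidate is then verified with SequenceEqual.
// For example, when searching for "abc", ch1 = 'a', ch2 = 'c' and ch1ch2Distance = 2, so a candidate
// position must contain an 'a' with a 'c' two bytes later.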
SEARCH_TWO_BYTES:
if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<byte>.Count >= 0)
{
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
nint ch1ch2Distance = (nint)(uint)valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector512<byte> ch1 = Vector512.Create(value);
Vector512<byte> ch2 = Vector512.Create(ch2Val);
nint searchSpaceMinusValueTailLengthAndVector =
searchSpaceMinusValueTailLength - (nint)Vector512<byte>.Count;
do
{
Debug.Assert(offset >= 0);
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector512<byte>.Count <= searchSpaceLength);
Vector512<byte> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector512<byte> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
if (cmpAnd != Vector512<byte>.Zero)
{
goto CANDIDATE_FOUND;
}
LOOP_FOOTER:
offset += Vector512<byte>.Count;
if (offset == searchSpaceMinusValueTailLength)
return -1;
// Overlap with the current chunk for trailing elements
if (offset > searchSpaceMinusValueTailLengthAndVector)
offset = searchSpaceMinusValueTailLengthAndVector;
continue;
CANDIDATE_FOUND:
ulong mask = cmpAnd.ExtractMostSignificantBits();
do
{
int bitPos = BitOperations.TrailingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + bitPos),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return (int)(offset + bitPos);
}
mask = BitOperations.ResetLowestSetBit(mask); // Clear the lowest set bit
} while (mask != 0);
goto LOOP_FOOTER;
} while (true);
}
else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<byte>.Count >= 0)
{
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
nint ch1ch2Distance = valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector256<byte> ch1 = Vector256.Create(value);
Vector256<byte> ch2 = Vector256.Create(ch2Val);
nint searchSpaceMinusValueTailLengthAndVector =
searchSpaceMinusValueTailLength - (nint)Vector256<byte>.Count;
do
{
Debug.Assert(offset >= 0);
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector256<byte>.Count <= searchSpaceLength);
Vector256<byte> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<byte> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
if (cmpAnd != Vector256<byte>.Zero)
{
goto CANDIDATE_FOUND;
}
LOOP_FOOTER:
offset += Vector256<byte>.Count;
if (offset == searchSpaceMinusValueTailLength)
return -1;
// Overlap with the current chunk for trailing elements
if (offset > searchSpaceMinusValueTailLengthAndVector)
offset = searchSpaceMinusValueTailLengthAndVector;
continue;
CANDIDATE_FOUND:
uint mask = cmpAnd.ExtractMostSignificantBits();
do
{
int bitPos = BitOperations.TrailingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + bitPos),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return (int)(offset + bitPos);
}
mask = BitOperations.ResetLowestSetBit(mask); // Clear the lowest set bit
} while (mask != 0);
goto LOOP_FOOTER;
} while (true);
}
else // 128bit vector path (SSE2 or AdvSimd)
{
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
int ch1ch2Distance = valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector128<byte> ch1 = Vector128.Create(value);
Vector128<byte> ch2 = Vector128.Create(ch2Val);
nint searchSpaceMinusValueTailLengthAndVector =
searchSpaceMinusValueTailLength - (nint)Vector128<byte>.Count;
do
{
Debug.Assert(offset >= 0);
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector128<byte>.Count <= searchSpaceLength);
Vector128<byte> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<byte> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
if (cmpAnd != Vector128<byte>.Zero)
{
goto CANDIDATE_FOUND;
}
LOOP_FOOTER:
offset += Vector128<byte>.Count;
if (offset == searchSpaceMinusValueTailLength)
return -1;
// Overlap with the current chunk for trailing elements
if (offset > searchSpaceMinusValueTailLengthAndVector)
offset = searchSpaceMinusValueTailLengthAndVector;
continue;
CANDIDATE_FOUND:
uint mask = cmpAnd.ExtractMostSignificantBits();
do
{
int bitPos = BitOperations.TrailingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + bitPos),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return (int)(offset + bitPos);
}
// Clear the lowest set bit
mask = BitOperations.ResetLowestSetBit(mask);
} while (mask != 0);
goto LOOP_FOOTER;
} while (true);
}
}
public static int LastIndexOf(ref byte searchSpace, int searchSpaceLength, ref byte value, int valueLength)
{
Debug.Assert(searchSpaceLength >= 0);
Debug.Assert(valueLength >= 0);
if (valueLength == 0)
return searchSpaceLength; // A zero-length sequence is always treated as "found" at the end of the search space.
int valueTailLength = valueLength - 1;
if (valueTailLength == 0)
return LastIndexOfValueType(ref searchSpace, value, searchSpaceLength); // for single-byte values use plain LastIndexOf
int offset = 0;
byte valueHead = value;
int searchSpaceMinusValueTailLength = searchSpaceLength - valueTailLength;
if (Vector128.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector128<byte>.Count)
{
goto SEARCH_TWO_BYTES;
}
ref byte valueTail = ref Unsafe.Add(ref value, 1);
while (true)
{
Debug.Assert(0 <= offset && offset <= searchSpaceLength); // Ensures no deceptive underflows in the computation of "remainingSearchSpaceLength".
int remainingSearchSpaceLength = searchSpaceLength - offset - valueTailLength;
if (remainingSearchSpaceLength <= 0)
break; // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there.
// Do a quick search for the first element of "value".
int relativeIndex = LastIndexOfValueType(ref searchSpace, valueHead, remainingSearchSpaceLength);
if (relativeIndex < 0)
break;
// Found the first element of "value". See if the tail matches.
if (SequenceEqual(
ref Unsafe.Add(ref searchSpace, relativeIndex + 1),
ref valueTail, (nuint)(uint)valueTailLength)) // The (nuint)-cast is necessary to pick the correct overload
return relativeIndex; // The tail matched. Return a successful find.
offset += remainingSearchSpaceLength - relativeIndex;
}
return -1;
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
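// Same candidate-filtering approach as in IndexOf above (compare the first byte and the last
// differing byte of "value" at once), except the search space is walked backwards and matches
// within each chunk are consumed starting from the highest set bit.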
SEARCH_TWO_BYTES:
if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512<byte>.Count)
{
offset = searchSpaceMinusValueTailLength - Vector512<byte>.Count;
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
int ch1ch2Distance = valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector512<byte> ch1 = Vector512.Create(value);
Vector512<byte> ch2 = Vector512.Create(ch2Val);
do
{
Vector512<byte> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector512<byte> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
if (cmpAnd != Vector512<byte>.Zero)
{
ulong mask = cmpAnd.ExtractMostSignificantBits();
do
{
// unlike IndexOf, here we use LZCNT to process matches starting from the end
int highestSetBitIndex = 63 - BitOperations.LeadingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + highestSetBitIndex),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return highestSetBitIndex + offset;
}
// Clear the highest set bit.
mask = BitOperations.FlipBit(mask, highestSetBitIndex);
} while (mask != 0);
}
offset -= Vector512<byte>.Count;
if (offset == -Vector512<byte>.Count)
return -1;
// Overlap with the current chunk if there is not enough room for the next one
if (offset < 0)
offset = 0;
} while (true);
}
else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<byte>.Count)
{
offset = searchSpaceMinusValueTailLength - Vector256<byte>.Count;
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
int ch1ch2Distance = valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector256<byte> ch1 = Vector256.Create(value);
Vector256<byte> ch2 = Vector256.Create(ch2Val);
do
{
Vector256<byte> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector256<byte> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
if (cmpAnd != Vector256<byte>.Zero)
{
uint mask = cmpAnd.ExtractMostSignificantBits();
do
{
// unlike IndexOf, here we use LZCNT to process matches starting from the end
int highestSetBitIndex = 31 - BitOperations.LeadingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + highestSetBitIndex),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return highestSetBitIndex + offset;
}
// Clear the highest set bit.
mask = BitOperations.FlipBit(mask, highestSetBitIndex);
} while (mask != 0);
}
offset -= Vector256<byte>.Count;
if (offset == -Vector256<byte>.Count)
return -1;
// Overlap with the current chunk if there is not enough room for the next one
if (offset < 0)
offset = 0;
} while (true);
}
else // 128bit vector path (SSE2 or AdvSimd)
{
offset = searchSpaceMinusValueTailLength - Vector128<byte>.Count;
// Find the last byte of "value" that is not equal to its first byte (ch1);
// the algorithm still works if they end up equal, it is just slightly less efficient.
byte ch2Val = Unsafe.Add(ref value, valueTailLength);
int ch1ch2Distance = valueTailLength;
while (ch2Val == value && ch1ch2Distance > 1)
ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
Vector128<byte> ch1 = Vector128.Create(value);
Vector128<byte> ch2 = Vector128.Create(ch2Val);
do
{
Vector128<byte> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector128<byte> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
// it's especially important for ARM where ExtractMostSignificantBits is not cheap
if (cmpAnd != Vector128<byte>.Zero)
{
uint mask = cmpAnd.ExtractMostSignificantBits();
do
{
// unlike IndexOf, here we use LZCNT to process matches starting from the end
int highestSetBitIndex = 31 - BitOperations.LeadingZeroCount(mask);
if (valueLength == 2 || // we already matched two bytes
SequenceEqual(
ref Unsafe.Add(ref searchSpace, offset + highestSetBitIndex),
ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
{
return highestSetBitIndex + offset;
}
// Clear the highest set bit.
mask = BitOperations.FlipBit(mask, highestSetBitIndex);
} while (mask != 0);
}
offset -= Vector128<byte>.Count;
if (offset == -Vector128<byte>.Count)
return -1;
// Overlap with the current chunk if there is not enough room for the next one
if (offset < 0)
offset = 0;
} while (true);
}
}
[DoesNotReturn]
private static void ThrowMustBeNullTerminatedString()
{
throw new ArgumentException(SR.Arg_MustBeNullTerminatedString);
}
// IndexOfNullByte processes memory in aligned chunks, and thus it won't crash even if it accesses memory beyond the null terminator.
// This behavior is an implementation detail of the runtime and callers outside System.Private.CoreLib must not depend on it.
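// Strategy: scan byte-by-byte until the pointer is Vector128-aligned, then search in progressively
// wider vector chunks (aligning up to Vector256/Vector512 where supported) so that aligned loads
// never straddle a page boundary before the terminator has been found.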
internal static unsafe int IndexOfNullByte(byte* searchSpace)
{
const int Length = int.MaxValue;
const uint uValue = 0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = (nuint)(uint)Length;
if (Vector128.IsHardwareAccelerated)
{
// The wider vector paths also begin with Vector128-sized work, so a single Vector128-based alignment count covers all of them.
lengthToExamine = UnalignedCountVector128(searchSpace);
}
SequentialScan:
while (lengthToExamine >= 8)
{
lengthToExamine -= 8;
if (uValue == searchSpace[offset])
goto Found;
if (uValue == searchSpace[offset + 1])
goto Found1;
if (uValue == searchSpace[offset + 2])
goto Found2;
if (uValue == searchSpace[offset + 3])
goto Found3;
if (uValue == searchSpace[offset + 4])
goto Found4;
if (uValue == searchSpace[offset + 5])
goto Found5;
if (uValue == searchSpace[offset + 6])
goto Found6;
if (uValue == searchSpace[offset + 7])
goto Found7;
offset += 8;
}
if (lengthToExamine >= 4)
{
lengthToExamine -= 4;
if (uValue == searchSpace[offset])
goto Found;
if (uValue == searchSpace[offset + 1])
goto Found1;
if (uValue == searchSpace[offset + 2])
goto Found2;
if (uValue == searchSpace[offset + 3])
goto Found3;
offset += 4;
}
while (lengthToExamine > 0)
{
lengthToExamine -= 1;
if (uValue == searchSpace[offset])
goto Found;
offset += 1;
}
// We only get past SequentialScan when hardware acceleration is available and the remaining length exceeds the vector length.
// However, we keep the redundant check so the JIT can see that the vector code is unreachable and eliminate it when the platform
// does not have hardware acceleration. After processing vector-sized chunks we return to SequentialScan to finish any remainder.
if (Vector512.IsHardwareAccelerated)
{
if (offset < (nuint)(uint)Length)
{
if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
{
// Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
// with no upper bound e.g. String.strlen.
// Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
// This ensures we do not fault across memory pages while searching for an end of string.
Vector128<byte> search = Vector128.Load(searchSpace + offset);
// Same method as below
uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector512<byte>.Count - 1)) != 0)
{
// Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches
// with no upper bound e.g. String.strlen.
// Start with a check on Vector256 to align to Vector512, before moving to processing Vector512.
// This ensures we do not fault across memory pages while searching for an end of string.
Vector256<byte> search = Vector256.Load(searchSpace + offset);
// Same method as below
uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector256<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
lengthToExamine = GetByteVector512SpanLength(offset, Length);
if (lengthToExamine > offset)
{
do
{
Vector512<byte> search = Vector512.Load(searchSpace + offset);
ulong matches = Vector512.Equals(Vector512<byte>.Zero, search).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector512<byte>.Count;
continue;
}
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
} while (lengthToExamine > offset);
}
lengthToExamine = GetByteVector256SpanLength(offset, Length);
if (lengthToExamine > offset)
{
Vector256<byte> search = Vector256.Load(searchSpace + offset);
// Same method as above
uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector256<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
lengthToExamine = GetByteVector128SpanLength(offset, Length);
if (lengthToExamine > offset)
{
Vector128<byte> search = Vector128.Load(searchSpace + offset);
// Same method as above
uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
if (offset < (nuint)(uint)Length)
{
lengthToExamine = ((nuint)(uint)Length - offset);
goto SequentialScan;
}
}
}
else if (Vector256.IsHardwareAccelerated)
{
if (offset < (nuint)(uint)Length)
{
if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
{
// Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
// with no upper bound e.g. String.strlen.
// Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
// This ensures we do not fault across memory pages while searching for an end of string.
Vector128<byte> search = Vector128.Load(searchSpace + offset);
// Same method as below
uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
lengthToExamine = GetByteVector256SpanLength(offset, Length);
if (lengthToExamine > offset)
{
do
{
Vector256<byte> search = Vector256.Load(searchSpace + offset);
uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector256<byte>.Count;
continue;
}
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
} while (lengthToExamine > offset);
}
lengthToExamine = GetByteVector128SpanLength(offset, Length);
if (lengthToExamine > offset)
{
Vector128<byte> search = Vector128.Load(searchSpace + offset);
// Same method as above
uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
}
else
{
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
}
if (offset < (nuint)(uint)Length)
{
lengthToExamine = ((nuint)(uint)Length - offset);
goto SequentialScan;
}
}
}
else if (Vector128.IsHardwareAccelerated)
{
if (offset < (nuint)(uint)Length)
{
lengthToExamine = GetByteVector128SpanLength(offset, Length);
while (lengthToExamine > offset)
{
Vector128<byte> search = Vector128.Load(searchSpace + offset);
// Same method as above
Vector128<byte> compareResult = Vector128.Equals(Vector128<byte>.Zero, search);
if (compareResult == Vector128<byte>.Zero)
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
continue;
}
// Find bitflag offset of first match and add to current offset
uint matches = compareResult.ExtractMostSignificantBits();
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
}
if (offset < (nuint)(uint)Length)
{
lengthToExamine = ((nuint)(uint)Length - offset);
goto SequentialScan;
}
}
}
ThrowMustBeNullTerminatedString();
Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
return (int)offset;
Found1:
return (int)(offset + 1);
Found2:
return (int)(offset + 2);
Found3:
return (int)(offset + 3);
Found4:
return (int)(offset + 4);
Found5:
return (int)(offset + 5);
Found6:
return (int)(offset + 6);
Found7:
return (int)(offset + 7);
}
// Optimized byte-based SequenceEqual. The "length" parameter is declared as a nuint rather than an int because we also use this routine for types other than byte,
// where the length can exceed 2 GiB once scaled by sizeof(T).
[Intrinsic] // Unrolled for constant length
public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
{
bool result;
// Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
if (length >= (nuint)sizeof(nuint))
{
// Conditional jmp forward to favor shorter lengths. (See comment at the "Equal:" label.)
// Longer lengths can recover the cost of a branch misprediction
// better than shorter lengths can.
goto Longer;
}
#if TARGET_64BIT
// On 32-bit, this will always be true since sizeof(nuint) == 4
if (length < sizeof(uint))
#endif
{
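// Lengths 0-3: accumulate differences without branching on the data. If bit 1 of the length
// is set, compare the leading 2 bytes; if bit 0 is set, compare the byte that follows them
// (at offset length & 2). Any mismatch leaves a non-zero value in differentBits.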
uint differentBits = 0;
nuint offset = (length & 2);
if (offset != 0)
{
differentBits = LoadUShort(ref first);
differentBits -= LoadUShort(ref second);
}
if ((length & 1) != 0)
{
differentBits |= (uint)Unsafe.AddByteOffset(ref first, offset) - (uint)Unsafe.AddByteOffset(ref second, offset);
}
result = (differentBits == 0);
goto Result;
}
#if TARGET_64BIT
else
{
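// Lengths 4-7 (64-bit only): two overlapping 4-byte reads, one from the start and one ending
// at the last byte, cover the whole buffer; OR the differences together and test once.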
nuint offset = length - sizeof(uint);
uint differentBits = LoadUInt(ref first) - LoadUInt(ref second);
differentBits |= LoadUInt(ref first, offset) - LoadUInt(ref second, offset);
result = (differentBits == 0);
goto Result;
}
#endif
Longer:
// Only check whether the refs are the same when the buffers are large,
// since that is when avoiding the unnecessary comparisons is worth it.
if (!Unsafe.AreSame(ref first, ref second))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Vector;
}
// This becomes a conditional jmp forward to not favor it.
goto Equal;
Result:
return result;
// When the sequences are equal, which is the longest execution path, we want to determine that
// as fast as possible, so we do not want the early outs to be "predicted not taken" branches.
Equal:
return true;
Vector:
if (Vector128.IsHardwareAccelerated)
{
if (Vector512.IsHardwareAccelerated && length >= (nuint)Vector512<byte>.Count)
{
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector512<byte>.Count;
// Unsigned arithmetic: an underflow here would have wrapped to a value larger than length rather than going negative
Debug.Assert(lengthToExamine < length);
if (lengthToExamine != 0)
{
do
{
if (Vector512.LoadUnsafe(ref first, offset) !=
Vector512.LoadUnsafe(ref second, offset))
{
goto NotEqual;
}
offset += (nuint)Vector512<byte>.Count;
} while (lengthToExamine > offset);
}
// Do final compare as Vector512<byte>.Count from end rather than start
if (Vector512.LoadUnsafe(ref first, lengthToExamine) ==
Vector512.LoadUnsafe(ref second, lengthToExamine))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Equal;
}
// This becomes a conditional jmp forward to not favor it.
goto NotEqual;
}
else if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
{
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
// Unsigned arithmetic: an underflow here would have wrapped to a value larger than length rather than going negative
Debug.Assert(lengthToExamine < length);
if (lengthToExamine != 0)
{
do
{
if (Vector256.LoadUnsafe(ref first, offset) !=
Vector256.LoadUnsafe(ref second, offset))
{
goto NotEqual;
}
offset += (nuint)Vector256<byte>.Count;
} while (lengthToExamine > offset);
}
// Do final compare as Vector256<byte>.Count from end rather than start
if (Vector256.LoadUnsafe(ref first, lengthToExamine) ==
Vector256.LoadUnsafe(ref second, lengthToExamine))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Equal;
}
// This becomes a conditional jmp forward to not favor it.
goto NotEqual;
}
else if (length >= (nuint)Vector128<byte>.Count)
{
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
// Unsigned arithmetic: an underflow here would have wrapped to a value larger than length rather than going negative
Debug.Assert(lengthToExamine < length);
if (lengthToExamine != 0)
{
do
{
if (Vector128.LoadUnsafe(ref first, offset) !=
Vector128.LoadUnsafe(ref second, offset))
{
goto NotEqual;
}
offset += (nuint)Vector128<byte>.Count;
} while (lengthToExamine > offset);
}
// Do final compare as Vector128<byte>.Count from end rather than start
if (Vector128.LoadUnsafe(ref first, lengthToExamine) ==
Vector128.LoadUnsafe(ref second, lengthToExamine))
{
// C# compiler inverts this test, making the outer goto the conditional jmp.
goto Equal;
}
// This becomes a conditional jmp forward to not favor it.
goto NotEqual;
}
}
#if TARGET_64BIT
if (Vector128.IsHardwareAccelerated)
{
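// Lengths between sizeof(nuint) and 2 * sizeof(nuint): two overlapping native-word reads,
// one from the start and one ending at the last byte, cover the whole buffer.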
Debug.Assert(length <= (nuint)sizeof(nuint) * 2);
nuint offset = length - (nuint)sizeof(nuint);
nuint differentBits = LoadNUInt(ref first) - LoadNUInt(ref second);
differentBits |= LoadNUInt(ref first, offset) - LoadNUInt(ref second, offset);
result = (differentBits == 0);
goto Result;
}
else
#endif
{
Debug.Assert(length >= (nuint)sizeof(nuint));
{
nuint offset = 0;
nuint lengthToExamine = length - (nuint)sizeof(nuint);
// Unsigned arithmetic: an underflow here would have wrapped to a value larger than length rather than going negative
Debug.Assert(lengthToExamine < length);
if (lengthToExamine > 0)
{
do
{
// Compare unsigned so not do a sign extend mov on 64 bit
if (LoadNUInt(ref first, offset) != LoadNUInt(ref second, offset))
{
goto NotEqual;
}
offset += (nuint)sizeof(nuint);
} while (lengthToExamine > offset);
}
// Do final compare as sizeof(nuint) from end rather than start
result = (LoadNUInt(ref first, lengthToExamine) == LoadNUInt(ref second, lengthToExamine));
goto Result;
}
}
// As there are so many true/false exit points, the JIT will coalesce them into one location.
// We want them at the end so the conditional early-exit jmps are all forward jmps, which a
// branch predictor in an uninitialized state will not take, e.g.:
// - loops are conditional backward jmps and predicted taken
// - exceptions are conditional forward jmps and predicted not taken
NotEqual:
return false;
}
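// Lexicographic comparison: locate the first differing byte using vector equality masks (or
// native-word reads when vectors are unavailable), compare that byte, and fall back to the
// length difference when one buffer is a prefix of the other.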
public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref byte second, int secondLength)
{
Debug.Assert(firstLength >= 0);
Debug.Assert(secondLength >= 0);
if (Unsafe.AreSame(ref first, ref second))
goto Equal;
nuint minLength = (nuint)(((uint)firstLength < (uint)secondLength) ? (uint)firstLength : (uint)secondLength);
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = minLength;
if (Vector256.IsHardwareAccelerated)
{
if (Vector512.IsHardwareAccelerated && (lengthToExamine >= (nuint)Vector512<byte>.Count))
{
lengthToExamine -= (nuint)Vector512<byte>.Count;
ulong matches;
while (lengthToExamine > offset)
{
matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
// 64 elements in Vector512<byte> so we compare to ulong.MaxValue to check if everything matched
if (matches == ulong.MaxValue)
{
// All matched
offset += (nuint)Vector512<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same method as above
matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
if (matches == ulong.MaxValue)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
ulong differences = ~matches;
// Find bitflag offset of first difference and add to current offset
offset += (uint)BitOperations.TrailingZeroCount(differences);
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
if (lengthToExamine >= (nuint)Vector256<byte>.Count)
{
lengthToExamine -= (nuint)Vector256<byte>.Count;
uint matches;
while (lengthToExamine > offset)
{
matches = Vector256.Equals(Vector256.LoadUnsafe(ref first, offset), Vector256.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
// 32 elements in Vector256<byte> so we compare to uint.MaxValue to check if everything matched
if (matches == uint.MaxValue)
{
// All matched
offset += (nuint)Vector256<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same method as above
matches = Vector256.Equals(Vector256.LoadUnsafe(ref first, offset), Vector256.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
if (matches == uint.MaxValue)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
uint differences = ~matches;
// Find bitflag offset of first difference and add to current offset
offset += (uint)BitOperations.TrailingZeroCount(differences);
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
if (lengthToExamine >= (nuint)Vector128<byte>.Count)
{
lengthToExamine -= (nuint)Vector128<byte>.Count;
uint matches;
if (lengthToExamine > offset)
{
matches = Vector128.Equals(Vector128.LoadUnsafe(ref first, offset), Vector128.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
// 16 elements in Vector128<byte> so we compare to ushort.MaxValue to check if everything matched
if (matches != ushort.MaxValue)
{
goto Difference;
}
}
// Move to Vector length from end for final compare
offset = lengthToExamine;
// Same method as above
matches = Vector128.Equals(Vector128.LoadUnsafe(ref first, offset), Vector128.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
if (matches == ushort.MaxValue)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
uint differences = ~matches;
// Find bitflag offset of first difference and add to current offset
offset += (uint)BitOperations.TrailingZeroCount(differences);
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
}
else if (Vector128.IsHardwareAccelerated)
{
if (lengthToExamine >= (nuint)Vector128<byte>.Count)
{
lengthToExamine -= (nuint)Vector128<byte>.Count;
while (lengthToExamine > offset)
{
if (Vector128.LoadUnsafe(ref first, offset) == Vector128.LoadUnsafe(ref second, offset))
{
// All matched
offset += (nuint)Vector128<byte>.Count;
continue;
}
goto BytewiseCheck;
}
// Move to Vector length from end for final compare
offset = lengthToExamine;
if (Vector128.LoadUnsafe(ref first, offset) == Vector128.LoadUnsafe(ref second, offset))
{
// All matched
goto Equal;
}
goto BytewiseCheck;
}
}
if (lengthToExamine > (nuint)sizeof(nuint))
{
lengthToExamine -= (nuint)sizeof(nuint);
while (lengthToExamine > offset)
{
if (LoadNUInt(ref first, offset) != LoadNUInt(ref second, offset))
{
goto BytewiseCheck;
}
offset += (nuint)sizeof(nuint);
}
}
BytewiseCheck: // Workaround for https://github.com/dotnet/runtime/issues/8795
while (minLength > offset)
{
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
if (result != 0)
return result;
offset += 1;
}
Equal:
return firstLength - secondLength;
}
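// Returns the number of leading bytes at which "first" and "second" are equal, up to "length".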
public static nuint CommonPrefixLength(ref byte first, ref byte second, nuint length)
{
nuint i;
// It is ordered this way to match the default branch-predictor rules, so that short inputs
// do not incur too much overhead.
if (!Vector128.IsHardwareAccelerated || length < (nuint)Vector128<byte>.Count)
{
// To provide a reasonably fast path for small inputs, we handle just enough elements here
// so that either we are done or the unrolled loop below can take over.
i = length % 4;
if (i > 0)
{
if (first != second)
{
return 0;
}
if (i > 1)
{
if (Unsafe.Add(ref first, 1) != Unsafe.Add(ref second, 1))
{
return 1;
}
if (i > 2 && Unsafe.Add(ref first, 2) != Unsafe.Add(ref second, 2))
{
return 2;
}
}
}
for (; (nint)i <= (nint)length - 4; i += 4)
{
if (Unsafe.Add(ref first, i + 0) != Unsafe.Add(ref second, i + 0)) goto Found0;
if (Unsafe.Add(ref first, i + 1) != Unsafe.Add(ref second, i + 1)) goto Found1;
if (Unsafe.Add(ref first, i + 2) != Unsafe.Add(ref second, i + 2)) goto Found2;
if (Unsafe.Add(ref first, i + 3) != Unsafe.Add(ref second, i + 3)) goto Found3;
}
return length;
Found0:
return i;
Found1:
return i + 1;
Found2:
return i + 2;
Found3:
return i + 3;
}
Debug.Assert(length >= (uint)Vector128<byte>.Count);
uint mask;
nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
Vector128<byte> maskVec;
i = 0;
while (i < lengthToExamine)
{
maskVec = Vector128.Equals(
Vector128.LoadUnsafe(ref first, i),
Vector128.LoadUnsafe(ref second, i));
mask = maskVec.ExtractMostSignificantBits();
if (mask != 0xFFFF)
{
goto Found;
}
i += (nuint)Vector128<byte>.Count;
}
// Do final compare as Vector128<byte>.Count from end rather than start
i = lengthToExamine;
maskVec = Vector128.Equals(
Vector128.LoadUnsafe(ref first, i),
Vector128.LoadUnsafe(ref second, i));
mask = maskVec.ExtractMostSignificantBits();
if (mask != 0xFFFF)
{
goto Found;
}
return length;
Found:
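// The mask has a bit set for every byte that compared equal, so invert it and the lowest set
// bit identifies the first mismatch within this chunk.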
mask = ~mask;
return i + uint.TrailingZeroCount(mask);
}
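// Map a per-byte match mask (where a matching byte has a bit set within its 8-bit lane) to the
// element index of the first/last matching byte by dividing the bit position by 8.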
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(ulong match)
=> BitOperations.TrailingZeroCount(match) >> 3;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateLastFoundByte(ulong match)
=> BitOperations.Log2(match) >> 3;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ushort LoadUShort(ref byte start)
=> Unsafe.ReadUnaligned<ushort>(ref start);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint LoadUInt(ref byte start)
=> Unsafe.ReadUnaligned<uint>(ref start);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint LoadUInt(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint LoadNUInt(ref byte start)
=> Unsafe.ReadUnaligned<nuint>(ref start);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint LoadNUInt(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<nuint>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> LoadVector256(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
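// Round the not-yet-examined length (length - offset) down to a whole number of vectors.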
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector128SpanLength(nuint offset, int length)
=> (nuint)(uint)((length - (int)offset) & ~(Vector128<byte>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector256SpanLength(nuint offset, int length)
=> (nuint)(uint)((length - (int)offset) & ~(Vector256<byte>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector512SpanLength(nuint offset, int length)
=> (nuint)(uint)((length - (int)offset) & ~(Vector512<byte>.Count - 1));
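// Number of bytes that must be scanned before "searchSpace" reaches Vector128 alignment
// (zero when it is already aligned).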
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint UnalignedCountVector128(byte* searchSpace)
{
nint unaligned = (nint)searchSpace & (Vector128<byte>.Count - 1);
return (nuint)(uint)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1));
}
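// Reverses the buffer in place: swap vector-sized chunks taken from both ends (each chunk is
// byte-reversed with a shuffle before being stored at the opposite end), then shrink to 8-byte
// and 4-byte swaps via ReverseEndianness, and finish the last few bytes with ReverseInner.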
public static void Reverse(ref byte buf, nuint length)
{
Debug.Assert(length > 1);
nint remainder = (nint)length;
nint offset = 0;
if (Vector512.IsHardwareAccelerated && remainder >= Vector512<byte>.Count * 2)
{
nint lastOffset = remainder - Vector512<byte>.Count;
do
{
// Load the values into vectors
Vector512<byte> tempFirst = Vector512.LoadUnsafe(ref buf, (nuint)offset);
Vector512<byte> tempLast = Vector512.LoadUnsafe(ref buf, (nuint)lastOffset);
// Shuffle to reverse each vector:
// +---------------------------------------------------------------+
// | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
// +---------------------------------------------------------------+
// --->
// +---------------------------------------------------------------+
// | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
// +---------------------------------------------------------------+
tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create(
(byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector512.Shuffle(tempLast, Vector512.Create(
(byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, (nuint)offset);
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
offset += Vector512<byte>.Count;
lastOffset -= Vector512<byte>.Count;
} while (lastOffset >= offset);
remainder = lastOffset + Vector512<byte>.Count - offset;
}
else if (Avx2.IsSupported && remainder >= (nint)(Vector256<byte>.Count * 1.5))
{
Vector256<byte> reverseMask = Vector256.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane
nint lastOffset = remainder - Vector256<byte>.Count;
do
{
// Load the values into vectors
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, (nuint)offset);
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, (nuint)lastOffset);
// Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
// Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
// +-------------------------------------------------------------------------------+
// | A1 | B1 | C1 | D1 | E1 | F1 | G1 | H1 | I1 | J1 | K1 | L1 | M1 | N1 | O1 | P1 |
// +-------------------------------------------------------------------------------+
// | A2 | B2 | C2 | D2 | E2 | F2 | G2 | H2 | I2 | J2 | K2 | L2 | M2 | N2 | O2 | P2 |
// +-------------------------------------------------------------------------------+
// Shuffle --->
// +-------------------------------------------------------------------------------+
// | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
// +-------------------------------------------------------------------------------+
// | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
// +-------------------------------------------------------------------------------+
// Permute --->
// +-------------------------------------------------------------------------------+
// | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
// +-------------------------------------------------------------------------------+
// | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
// +-------------------------------------------------------------------------------+
tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
tempLast = Avx2.Shuffle(tempLast, reverseMask);
tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);
// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, (nuint)offset);
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
offset += Vector256<byte>.Count;
lastOffset -= Vector256<byte>.Count;
} while (lastOffset >= offset);
remainder = lastOffset + Vector256<byte>.Count - offset;
}
else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<byte>.Count * 2)
{
nint lastOffset = remainder - Vector128<byte>.Count;
do
{
// Load the values into vectors
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, (nuint)offset);
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, (nuint)lastOffset);
// Shuffle to reverse each vector:
// +---------------------------------------------------------------+
// | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
// +---------------------------------------------------------------+
// --->
// +---------------------------------------------------------------+
// | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
// +---------------------------------------------------------------+
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, (nuint)offset);
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
offset += Vector128<byte>.Count;
lastOffset -= Vector128<byte>.Count;
} while (lastOffset >= offset);
remainder = lastOffset + Vector128<byte>.Count - offset;
}
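// Fewer than two vectors (or no vector support) remain: reverse the rest with paired 8-byte,
// then 4-byte, reads from both ends, using ReverseEndianness to flip the bytes within each read.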
if (remainder >= sizeof(long))
{
nint lastOffset = (nint)length - offset - sizeof(long);
do
{
long tempFirst = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, offset));
long tempLast = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, lastOffset));
// swap and store in reversed position
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));
offset += sizeof(long);
lastOffset -= sizeof(long);
} while (lastOffset >= offset);
remainder = lastOffset + sizeof(long) - offset;
}
if (remainder >= sizeof(int))
{
nint lastOffset = (nint)length - offset - sizeof(int);
do
{
int tempFirst = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, offset));
int tempLast = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, lastOffset));
// swap and store in reversed position
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));
offset += sizeof(int);
lastOffset -= sizeof(int);
} while (lastOffset >= offset);
remainder = lastOffset + sizeof(int) - offset;
}
if (remainder > 1)
{
ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
}
}
}
}