|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
#if NET
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using static System.IO.Hashing.VectorHelper;
namespace System.IO.Hashing
{
public partial class Crc32ParameterSet
{
/// <summary>
/// Vector128-based half of the reflected CRC-32 implementation: derives the per-polynomial
/// constants once, then consumes input in whole 16-byte vectors.
/// </summary>
private partial class ReflectedCrc32
{
    // Multiplied by the vector width (in bytes) to get the shortest input that takes the vector path.
    private readonly int _shouldVectorizeScale = 1;

    // Constant pair used by the loop that advances four vectors per iteration.
    private Vector128<ulong> _k1k2;

    // Constant pair used whenever one vector is folded into the accumulator.
    private Vector128<ulong> _k3k4;

    // Scalar copy of the second element of _k3k4; first multiplier of the 128 -> 64 bit fold.
    private ulong _k4;

    // Second multiplier of the 128 -> 64 bit fold. Despite the "6" in its name, it is produced by
    // ComputeFoldingConstantCrc32(fullPoly, 64) in InitializeVectorized.
    private ulong _k6;

    // Element 0: bit-reflected 33-bit polynomial. Element 1: bit-reflected Barrett constant.
    private Vector128<ulong> _polyMu;

    /// <summary>
    /// Creates an instance whose vector-path threshold is
    /// <paramref name="shouldVectorizeScale"/> vectors instead of the default of one.
    /// </summary>
    /// <param name="shouldVectorizeScale">Positive multiplier for the minimum vectorized length.</param>
    /// <param name="polynomial">Forwarded unchanged to the three-argument constructor.</param>
    /// <param name="initialValue">Forwarded unchanged to the three-argument constructor.</param>
    /// <param name="finalXorValue">Forwarded unchanged to the three-argument constructor.</param>
    protected ReflectedCrc32(int shouldVectorizeScale, uint polynomial, uint initialValue, uint finalXorValue)
        : this(polynomial, initialValue, finalXorValue)
    {
        Debug.Assert(shouldVectorizeScale > 0);

        _shouldVectorizeScale = shouldVectorizeScale;
    }

    /// <summary>
    /// Computes the constants required by <see cref="UpdateVectorizedCore"/> and reports whether
    /// the vector path may be used.
    /// </summary>
    /// <param name="canVectorize">
    /// Set to <see langword="true"/> on little-endian platforms where VectorHelper reports support;
    /// otherwise left as the caller passed it.
    /// </param>
    partial void InitializeVectorized(ref bool canVectorize)
    {
        if (BitConverter.IsLittleEndian && VectorHelper.IsSupported)
        {
            // The polynomial property holds 32 bits; bit 32 is set explicitly to form the 33-bit value.
            ulong poly33 = (1UL << 32) | Polynomial;

            // Distances are written as "vectors * 128 +/- 32" to mirror how each constant is used.
            ulong wideLower = Reflect33(CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 4 * 128 + 32));
            ulong wideUpper = Reflect33(CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 4 * 128 - 32));
            ulong narrowLower = Reflect33(CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 128 + 32));
            ulong narrowUpper = Reflect33(CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 128 - 32));
            ulong final64 = Reflect33(CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 64));
            ulong barrett = CrcPolynomialHelper.ComputeBarrettConstantCrc32(poly33);

            _k1k2 = Vector128.Create(wideLower, wideUpper);
            _k3k4 = Vector128.Create(narrowLower, narrowUpper);
            _k4 = narrowUpper;
            _k6 = final64;
            _polyMu = Vector128.Create(Reflect33(poly33), Reflect33(barrett));

            canVectorize = true;
        }

        // Reverses all 64 bits, then drops the 31 low bits so that a 33-bit input stays 33 bits wide.
        static ulong Reflect33(ulong value) => Crc64ParameterSet.ReverseBits(value) >> 31;
    }

    /// <summary>
    /// Runs the vector path when it is enabled and <paramref name="source"/> is long enough.
    /// </summary>
    /// <param name="crc">Running CRC; replaced only when the vector path runs.</param>
    /// <param name="source">Bytes to process.</param>
    /// <param name="bytesConsumed">
    /// Overwritten with the number of bytes handled by the vector path; untouched when it is skipped.
    /// </param>
    partial void UpdateVectorized(ref uint crc, ReadOnlySpan<byte> source, ref int bytesConsumed)
    {
        if (_canVectorize && source.Length >= _shouldVectorizeScale * Vector128<byte>.Count)
        {
            crc = UpdateVectorizedCore(crc, source, out bytesConsumed);
        }
    }

    /// <summary>
    /// Folds every whole 16-byte vector of <paramref name="source"/> into <paramref name="crc"/>.
    /// </summary>
    /// <param name="crc">Running CRC on entry.</param>
    /// <param name="source">
    /// Input bytes. The first 16-byte load is unchecked, so the caller must supply at least one full vector.
    /// </param>
    /// <param name="bytesConsumed">
    /// Number of bytes folded in; always a multiple of 16. Any shorter tail is not processed here.
    /// </param>
    /// <returns>The updated running CRC.</returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private uint UpdateVectorizedCore(uint crc, ReadOnlySpan<byte> source, out int bytesConsumed)
    {
        int vectorBytes = Vector128<byte>.Count;
        int stripeBytes = vectorBytes * 4;

        ref byte cursor = ref MemoryMarshal.GetReference(source);
        int remaining = source.Length;

        // Either path begins with the first vector combined with the incoming CRC,
        // which CreateScalar places in the lowest 32 bits.
        Vector128<ulong> acc =
            Vector128.LoadUnsafe(ref cursor).AsUInt64() ^ Vector128.CreateScalar(crc).AsUInt64();

        if (remaining < stripeBytes)
        {
            // Short input: only the vector already loaded has been taken.
            remaining -= vectorBytes;
            cursor = ref Unsafe.Add(ref cursor, vectorBytes);
        }
        else
        {
            // Long input: keep four independent accumulators, one per vector of a 64-byte stripe.
            Vector128<ulong> lane1 = Vector128.LoadUnsafe(ref cursor, 16).AsUInt64();
            Vector128<ulong> lane2 = Vector128.LoadUnsafe(ref cursor, 32).AsUInt64();
            Vector128<ulong> lane3 = Vector128.LoadUnsafe(ref cursor, 48).AsUInt64();

            remaining -= stripeBytes;
            cursor = ref Unsafe.Add(ref cursor, stripeBytes);

            while (remaining >= stripeBytes)
            {
                Vector128<ulong> next0 = Vector128.LoadUnsafe(ref cursor).AsUInt64();
                Vector128<ulong> next1 = Vector128.LoadUnsafe(ref cursor, 16).AsUInt64();
                Vector128<ulong> next2 = Vector128.LoadUnsafe(ref cursor, 32).AsUInt64();
                Vector128<ulong> next3 = Vector128.LoadUnsafe(ref cursor, 48).AsUInt64();

                acc = FoldPolynomialPair(next0, acc, _k1k2);
                lane1 = FoldPolynomialPair(next1, lane1, _k1k2);
                lane2 = FoldPolynomialPair(next2, lane2, _k1k2);
                lane3 = FoldPolynomialPair(next3, lane3, _k1k2);

                remaining -= stripeBytes;
                cursor = ref Unsafe.Add(ref cursor, stripeBytes);
            }

            // Collapse the four accumulators into one, in stream order.
            acc = FoldPolynomialPair(lane1, acc, _k3k4);
            acc = FoldPolynomialPair(lane2, acc, _k3k4);
            acc = FoldPolynomialPair(lane3, acc, _k3k4);
        }

        // Take any further whole vectors one at a time.
        while (remaining >= vectorBytes)
        {
            Vector128<ulong> next = Vector128.LoadUnsafe(ref cursor).AsUInt64();
            acc = FoldPolynomialPair(next, acc, _k3k4);

            remaining -= vectorBytes;
            cursor = ref Unsafe.Add(ref cursor, vectorBytes);
        }

        // Fold 128 bits to 64 bits.
        // The mask keeps the low 32 bits of each 64-bit element.
        Vector128<ulong> low32Mask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();

        Vector128<ulong> upperHalf = ShiftRightBytesInVector(acc, 8);
        acc = upperHalf ^ CarrylessMultiplyLower(acc, Vector128.CreateScalar(_k4));

        Vector128<ulong> shiftedBy32 = ShiftRightBytesInVector(acc, 4);
        acc = CarrylessMultiplyLower(acc & low32Mask, Vector128.CreateScalar(_k6)) ^ shiftedBy32;

        // Barrett reduction to 32 bits: multiply by element 1 of _polyMu, then by element 0.
        Vector128<ulong> reduction = CarrylessMultiplyLeftLowerRightUpper(acc & low32Mask, _polyMu) & low32Mask;
        reduction = CarrylessMultiplyLower(reduction, _polyMu);
        acc ^= reduction;

        bytesConsumed = source.Length - remaining;

        // The result is read from the second 32-bit element.
        return acc.AsUInt32().GetElement(1);
    }
}
/// <summary>
/// Vector128-based half of the forward (non-reflected) CRC-32 implementation: derives the
/// per-polynomial constants once, then consumes input in whole 16-byte vectors whose bytes
/// are reversed on load.
/// </summary>
private partial class ForwardCrc32
{
    // Constant pair used by the loop that advances four vectors per iteration.
    // Stored as (distance 4 * 128, distance 4 * 128 + 64).
    private Vector128<ulong> _k1k2;

    // Constant pair used whenever one vector is folded into the accumulator.
    // Stored as (distance 128, distance 128 + 64).
    private Vector128<ulong> _k3k4;

    // Constant pair for the fold against a zero vector once all input is consumed.
    // Stored as (distance 32, distance 96).
    private Vector128<ulong> _foldConstants;

    // Distance-64 constant; multiplies the upper element during the reduction to 64 bits.
    private ulong _k6;

    // Barrett constant returned by CrcPolynomialHelper.ComputeBarrettConstantCrc32.
    private ulong _mu;

    /// <summary>
    /// Computes the constants required by <see cref="UpdateVectorizedCore"/> and reports whether
    /// the vector path may be used.
    /// </summary>
    /// <param name="canVectorize">
    /// Set to <see langword="true"/> on little-endian platforms where VectorHelper reports support;
    /// otherwise left as the caller passed it.
    /// </param>
    partial void InitializeVectorized(ref bool canVectorize)
    {
        if (BitConverter.IsLittleEndian && VectorHelper.IsSupported)
        {
            // The polynomial property holds 32 bits; bit 32 is set explicitly to form the 33-bit value.
            ulong poly33 = (1UL << 32) | Polynomial;

            ulong wideUpper = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 4 * 128 + 64);
            ulong wideLower = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 4 * 128);
            ulong narrowUpper = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 128 + 64);
            ulong narrowLower = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 128);
            ulong tailUpper = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 96);
            ulong final64 = CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 64);

            // Element 0 of each pair is the shorter distance; element 1 is 64 bits further.
            _k1k2 = Vector128.Create(wideLower, wideUpper);
            _k3k4 = Vector128.Create(narrowLower, narrowUpper);
            _k6 = final64;
            _foldConstants = Vector128.Create(
                CrcPolynomialHelper.ComputeFoldingConstantCrc32(poly33, 32),
                tailUpper);
            _mu = CrcPolynomialHelper.ComputeBarrettConstantCrc32(poly33);

            canVectorize = true;
        }
    }

    /// <summary>
    /// Loads 16 bytes and, on little-endian platforms, reverses their order so the first byte
    /// read ends up in the highest byte position of the vector.
    /// </summary>
    /// <param name="source">Reference the load is based on.</param>
    /// <param name="elementOffset">Byte offset from <paramref name="source"/> at which to load.</param>
    /// <returns>The loaded bytes reinterpreted as two 64-bit elements.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<ulong> LoadReversed(ref byte source, nuint elementOffset)
    {
        Vector128<byte> loaded = Vector128.LoadUnsafe(ref source, elementOffset);

        if (!BitConverter.IsLittleEndian)
        {
            return loaded.AsUInt64();
        }

        // Index i of the result takes byte (15 - i) of the input.
        // The index vector is kept inline so it stays a constant argument.
        return Vector128.Shuffle(
            loaded,
            Vector128.Create(
                (byte)0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
                0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00)).AsUInt64();
    }

    /// <summary>
    /// Runs the vector path when it is enabled and <paramref name="source"/> holds at least one vector.
    /// </summary>
    /// <param name="crc">Running CRC; replaced only when the vector path runs.</param>
    /// <param name="source">Bytes to process.</param>
    /// <param name="bytesConsumed">
    /// Overwritten with the number of bytes handled by the vector path; untouched when it is skipped.
    /// </param>
    partial void UpdateVectorized(ref uint crc, ReadOnlySpan<byte> source, ref int bytesConsumed)
    {
        if (_canVectorize && source.Length >= Vector128<byte>.Count)
        {
            crc = UpdateVectorizedCore(crc, source, out bytesConsumed);
        }
    }

    /// <summary>
    /// Folds every whole 16-byte vector of <paramref name="source"/> into <paramref name="crc"/>.
    /// </summary>
    /// <param name="crc">Running CRC on entry.</param>
    /// <param name="source">
    /// Input bytes. The first 16-byte load is unchecked, so the caller must supply at least one full vector.
    /// </param>
    /// <param name="bytesConsumed">
    /// Number of bytes folded in; always a multiple of 16. Any shorter tail is not processed here.
    /// </param>
    /// <returns>The updated running CRC.</returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private uint UpdateVectorizedCore(uint crc, ReadOnlySpan<byte> source, out int bytesConsumed)
    {
        int vectorBytes = Vector128<byte>.Count;
        int stripeBytes = vectorBytes * 4;

        ref byte cursor = ref MemoryMarshal.GetReference(source);
        int remaining = source.Length;

        // Either path begins with the first (byte-reversed) vector combined with the incoming CRC.
        // The CRC is shifted into the top 32 bits of a 64-bit value, which is then moved to the
        // upper element, so it lines up with the first four input bytes after reversal.
        Vector128<ulong> acc =
            LoadReversed(ref cursor, 0) ^ ShiftLowerToUpper(Vector128.CreateScalar((ulong)crc << 32));

        if (remaining < stripeBytes)
        {
            // Short input: only the vector already loaded has been taken.
            remaining -= vectorBytes;
            cursor = ref Unsafe.Add(ref cursor, vectorBytes);
        }
        else
        {
            // Long input: keep four independent accumulators, one per vector of a 64-byte stripe.
            Vector128<ulong> lane1 = LoadReversed(ref cursor, 16);
            Vector128<ulong> lane2 = LoadReversed(ref cursor, 32);
            Vector128<ulong> lane3 = LoadReversed(ref cursor, 48);

            remaining -= stripeBytes;
            cursor = ref Unsafe.Add(ref cursor, stripeBytes);

            while (remaining >= stripeBytes)
            {
                Vector128<ulong> next0 = LoadReversed(ref cursor, 0);
                Vector128<ulong> next1 = LoadReversed(ref cursor, 16);
                Vector128<ulong> next2 = LoadReversed(ref cursor, 32);
                Vector128<ulong> next3 = LoadReversed(ref cursor, 48);

                acc = FoldPolynomialPair(next0, acc, _k1k2);
                lane1 = FoldPolynomialPair(next1, lane1, _k1k2);
                lane2 = FoldPolynomialPair(next2, lane2, _k1k2);
                lane3 = FoldPolynomialPair(next3, lane3, _k1k2);

                remaining -= stripeBytes;
                cursor = ref Unsafe.Add(ref cursor, stripeBytes);
            }

            // Collapse the four accumulators into one, in stream order.
            acc = FoldPolynomialPair(lane1, acc, _k3k4);
            acc = FoldPolynomialPair(lane2, acc, _k3k4);
            acc = FoldPolynomialPair(lane3, acc, _k3k4);
        }

        // Take any further whole vectors one at a time.
        while (remaining >= vectorBytes)
        {
            Vector128<ulong> next = LoadReversed(ref cursor, 0);
            acc = FoldPolynomialPair(next, acc, _k3k4);

            remaining -= vectorBytes;
            cursor = ref Unsafe.Add(ref cursor, vectorBytes);
        }

        // Fold once more against a zero vector using the (distance 32, distance 96) pair.
        acc = FoldPolynomialPair(Vector128<ulong>.Zero, acc, _foldConstants);

        // Multiply the upper element by the distance-64 constant and merge it into the lower element.
        Vector128<ulong> lowerElementMask = Vector128.Create(~0UL, 0UL);
        acc = CarrylessMultiplyLeftUpperRightLower(acc, Vector128.CreateScalar(_k6)) ^ (acc & lowerElementMask);

        // Barrett reduction to 32 bits.
        // The mask keeps the low 32 bits of each 64-bit element.
        Vector128<ulong> low32Mask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();

        Vector128<ulong> quotient = ShiftRightBytesInVector(acc, 4) & low32Mask;
        quotient = CarrylessMultiplyLower(quotient, Vector128.CreateScalar(_mu));
        quotient = ShiftRightBytesInVector(quotient, 4) & low32Mask;
        quotient = CarrylessMultiplyLower(quotient, Vector128.CreateScalar<ulong>(Polynomial));
        acc ^= quotient;

        bytesConsumed = source.Length - remaining;

        // The result is read from the lowest 32-bit element.
        return acc.AsUInt32().GetElement(0);
    }
}
}
}
#endif
|