// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
// Copied from: https://github.com/dotnet/runtime/blob/f1d463f46268382a8287d71aea929dadaa5dfef5/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHashShared.cs#L1
// Remove once we can actually add a reference to System.IO.Hashing v8.0.0
// <auto-generated/>
using System.Buffers.Binary;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if NET8_0_OR_GREATER
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
#endif
namespace System.IO.Hashing
{
// /// <summary>Shared implementation of the XXH3 hash algorithm used by both the 64-bit <see cref="XxHash3"/> and the 128-bit <see cref="XxHash128"/> variants.</summary>
#if NET5_0_OR_GREATER
[SkipLocalsInit]
#endif
internal static unsafe class XxHashShared
{
public const int StripeLengthBytes = 64;
public const int SecretLengthBytes = 192;
public const int SecretSizeMin = 136;
public const int SecretLastAccStartBytes = 7;
public const int SecretConsumeRateBytes = 8;
public const int SecretMergeAccsStartBytes = 11;
public const int NumStripesPerBlock = (SecretLengthBytes - StripeLengthBytes) / SecretConsumeRateBytes;
public const int AccumulatorCount = StripeLengthBytes / sizeof(ulong);
public const int MidSizeMaxBytes = 240;
public const int InternalBufferStripes = InternalBufferLengthBytes / StripeLengthBytes;
public const int InternalBufferLengthBytes = 256;
/// <summary>The default secret for when no seed is provided.</summary>
/// <remarks>This is the same as a custom secret derived from a seed of 0.</remarks>
public static ReadOnlySpan<byte> DefaultSecret =>
[
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, // DefaultSecretUInt64_0
0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, // DefaultSecretUInt64_1
0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, // DefaultSecretUInt64_2
0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, // DefaultSecretUInt64_3
0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, // DefaultSecretUInt64_4
0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, // DefaultSecretUInt64_5
0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, // DefaultSecretUInt64_6
0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, // DefaultSecretUInt64_7
0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, // DefaultSecretUInt64_8
0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, // DefaultSecretUInt64_9
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, // DefaultSecretUInt64_10
0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, // DefaultSecretUInt64_11
0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, // DefaultSecretUInt64_12
0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, // DefaultSecretUInt64_13
0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, // DefaultSecretUInt64_14
0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, // DefaultSecretUInt64_15
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, // DefaultSecretUInt64_16
0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, // DefaultSecretUInt64_17
0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, // DefaultSecretUInt64_18
0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, // DefaultSecretUInt64_19
0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, // DefaultSecretUInt64_20
0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, // DefaultSecretUInt64_21
0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, // DefaultSecretUInt64_22
0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, // DefaultSecretUInt64_23
];
// Cast of DefaultSecret byte[] => ulong[] (See above for the correspondence)
public const ulong DefaultSecretUInt64_0 = 0xBE4BA423396CFEB8;
public const ulong DefaultSecretUInt64_1 = 0x1CAD21F72C81017C;
public const ulong DefaultSecretUInt64_2 = 0xDB979083E96DD4DE;
public const ulong DefaultSecretUInt64_3 = 0x1F67B3B7A4A44072;
public const ulong DefaultSecretUInt64_4 = 0x78E5C0CC4EE679CB;
public const ulong DefaultSecretUInt64_5 = 0x2172FFCC7DD05A82;
public const ulong DefaultSecretUInt64_6 = 0x8E2443F7744608B8;
public const ulong DefaultSecretUInt64_7 = 0x4C263A81E69035E0;
public const ulong DefaultSecretUInt64_8 = 0xCB00C391BB52283C;
public const ulong DefaultSecretUInt64_9 = 0xA32E531B8B65D088;
public const ulong DefaultSecretUInt64_10 = 0x4EF90DA297486471;
public const ulong DefaultSecretUInt64_11 = 0xD8ACDEA946EF1938;
public const ulong DefaultSecretUInt64_12 = 0x3F349CE33F76FAA8;
public const ulong DefaultSecretUInt64_13 = 0x1D4F0BC7C7BBDCF9;
public const ulong DefaultSecretUInt64_14 = 0x3159B4CD4BE0518A;
public const ulong DefaultSecretUInt64_15 = 0x647378D9C97E9FC8;
// Cast of DefaultSecret offset by 3 bytes, byte[] => ulong[]
public const ulong DefaultSecret3UInt64_0 = 0x81017CBE4BA42339;
public const ulong DefaultSecret3UInt64_1 = 0x6DD4DE1CAD21F72C;
public const ulong DefaultSecret3UInt64_2 = 0xA44072DB979083E9;
public const ulong DefaultSecret3UInt64_3 = 0xE679CB1F67B3B7A4;
public const ulong DefaultSecret3UInt64_4 = 0xD05A8278E5C0CC4E;
public const ulong DefaultSecret3UInt64_5 = 0x4608B82172FFCC7D;
public const ulong DefaultSecret3UInt64_6 = 0x9035E08E2443F774;
public const ulong DefaultSecret3UInt64_7 = 0x52283C4C263A81E6;
public const ulong DefaultSecret3UInt64_8 = 0x65D088CB00C391BB;
public const ulong DefaultSecret3UInt64_9 = 0x486471A32E531B8B;
public const ulong DefaultSecret3UInt64_10 = 0xEF19384EF90DA297;
public const ulong DefaultSecret3UInt64_11 = 0x76FAA8D8ACDEA946;
public const ulong DefaultSecret3UInt64_12 = 0xBBDCF93F349CE33F;
public const ulong DefaultSecret3UInt64_13 = 0xE0518A1D4F0BC7C7;
public const ulong Prime64_1 = 0x9E3779B185EBCA87UL;
public const ulong Prime64_2 = 0xC2B2AE3D27D4EB4FUL;
public const ulong Prime64_3 = 0x165667B19E3779F9UL;
public const ulong Prime64_4 = 0x85EBCA77C2B2AE63UL;
public const ulong Prime64_5 = 0x27D4EB2F165667C5UL;
public const uint Prime32_1 = 0x9E3779B1U;
public const uint Prime32_2 = 0x85EBCA77U;
public const uint Prime32_3 = 0xC2B2AE3DU;
public const uint Prime32_4 = 0x27D4EB2FU;
public const uint Prime32_5 = 0x165667B1U;
#if DEBUG
static XxHashShared()
{
// Make sure DefaultSecret is the custom secret derived from a seed of 0.
byte* secret = stackalloc byte[SecretLengthBytes];
DeriveSecretFromSeed(secret, 0);
Debug.Assert(new Span<byte>(secret, SecretLengthBytes).SequenceEqual(DefaultSecret));
// Validate some relationships.
Debug.Assert(InternalBufferLengthBytes % StripeLengthBytes == 0);
ReadOnlySpan<ulong> defaultSecretUInt64 = MemoryMarshal.Cast<byte, ulong>(DefaultSecret);
Debug.Assert(ReadLE64(defaultSecretUInt64[0]) == DefaultSecretUInt64_0);
Debug.Assert(ReadLE64(defaultSecretUInt64[1]) == DefaultSecretUInt64_1);
Debug.Assert(ReadLE64(defaultSecretUInt64[2]) == DefaultSecretUInt64_2);
Debug.Assert(ReadLE64(defaultSecretUInt64[3]) == DefaultSecretUInt64_3);
Debug.Assert(ReadLE64(defaultSecretUInt64[4]) == DefaultSecretUInt64_4);
Debug.Assert(ReadLE64(defaultSecretUInt64[5]) == DefaultSecretUInt64_5);
Debug.Assert(ReadLE64(defaultSecretUInt64[6]) == DefaultSecretUInt64_6);
Debug.Assert(ReadLE64(defaultSecretUInt64[7]) == DefaultSecretUInt64_7);
Debug.Assert(ReadLE64(defaultSecretUInt64[8]) == DefaultSecretUInt64_8);
Debug.Assert(ReadLE64(defaultSecretUInt64[9]) == DefaultSecretUInt64_9);
Debug.Assert(ReadLE64(defaultSecretUInt64[10]) == DefaultSecretUInt64_10);
Debug.Assert(ReadLE64(defaultSecretUInt64[11]) == DefaultSecretUInt64_11);
Debug.Assert(ReadLE64(defaultSecretUInt64[12]) == DefaultSecretUInt64_12);
Debug.Assert(ReadLE64(defaultSecretUInt64[13]) == DefaultSecretUInt64_13);
Debug.Assert(ReadLE64(defaultSecretUInt64[14]) == DefaultSecretUInt64_14);
Debug.Assert(ReadLE64(defaultSecretUInt64[15]) == DefaultSecretUInt64_15);
ReadOnlySpan<ulong> defaultSecret3UInt64 = MemoryMarshal.Cast<byte, ulong>(DefaultSecret.Slice(3));
Debug.Assert(ReadLE64(defaultSecret3UInt64[0]) == DefaultSecret3UInt64_0);
Debug.Assert(ReadLE64(defaultSecret3UInt64[1]) == DefaultSecret3UInt64_1);
Debug.Assert(ReadLE64(defaultSecret3UInt64[2]) == DefaultSecret3UInt64_2);
Debug.Assert(ReadLE64(defaultSecret3UInt64[3]) == DefaultSecret3UInt64_3);
Debug.Assert(ReadLE64(defaultSecret3UInt64[4]) == DefaultSecret3UInt64_4);
Debug.Assert(ReadLE64(defaultSecret3UInt64[5]) == DefaultSecret3UInt64_5);
Debug.Assert(ReadLE64(defaultSecret3UInt64[6]) == DefaultSecret3UInt64_6);
Debug.Assert(ReadLE64(defaultSecret3UInt64[7]) == DefaultSecret3UInt64_7);
Debug.Assert(ReadLE64(defaultSecret3UInt64[8]) == DefaultSecret3UInt64_8);
Debug.Assert(ReadLE64(defaultSecret3UInt64[9]) == DefaultSecret3UInt64_9);
Debug.Assert(ReadLE64(defaultSecret3UInt64[10]) == DefaultSecret3UInt64_10);
Debug.Assert(ReadLE64(defaultSecret3UInt64[11]) == DefaultSecret3UInt64_11);
Debug.Assert(ReadLE64(defaultSecret3UInt64[12]) == DefaultSecret3UInt64_12);
Debug.Assert(ReadLE64(defaultSecret3UInt64[13]) == DefaultSecret3UInt64_13);
static ulong ReadLE64(ulong data) => BitConverter.IsLittleEndian ? data : BinaryPrimitives.ReverseEndianness(data);
}
#endif
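/// <summary>Initializes the state for a new hash: stores the seed, materializes the secret (the default secret for a zero seed, otherwise one derived from the seed), and resets the streaming counters and accumulators.</summary>
/// <remarks>
/// Illustrative call sequence (a sketch; in the runtime the public XxHash3/XxHash128 types drive these calls):
/// Initialize(ref state, seed); then Append(ref state, chunk) for each chunk; finalization copies the
/// accumulators, runs <see cref="DigestLong"/> for inputs longer than 240 bytes, and folds them with <see cref="MergeAccumulators"/>.
/// </remarks>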
public static void Initialize(ref State state, ulong seed)
{
state.Seed = seed;
fixed (byte* secret = state.Secret)
{
if (seed == 0)
{
DefaultSecret.CopyTo(new Span<byte>(secret, SecretLengthBytes));
}
else
{
DeriveSecretFromSeed(secret, seed);
}
}
Reset(ref state);
}
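/// <summary>Resets the streaming counters (buffered byte count, stripes processed in the current block, total length) and reinitializes the accumulators to their starting primes; the seed and secret are left untouched.</summary>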
public static void Reset(ref State state)
{
state.BufferedCount = 0;
state.StripesProcessedInCurrentBlock = 0;
state.TotalLength = 0;
fixed (ulong* accumulators = state.Accumulators)
{
InitializeAccumulators(accumulators);
}
}
#if false
/// <summary>"This is a stronger avalanche, preferable when input has not been previously mixed."</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Rrmxmx(ulong hash, uint length)
{
hash ^= BitOperations.RotateLeft(hash, 49) ^ BitOperations.RotateLeft(hash, 24);
hash *= 0x9FB21C651E98DF25;
hash ^= (hash >> 35) + length;
hash *= 0x9FB21C651E98DF25;
return XorShift(hash, 28);
}
#endif
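/// <summary>One-shot accumulation loop for inputs longer than 240 bytes: processes all complete blocks (scrambling after each), then the remaining whole stripes, and finally a last stripe taken from the very end of the input using the dedicated end-of-secret offset.</summary>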
public static void HashInternalLoop(ulong* accumulators, byte* source, uint length, byte* secret)
{
Debug.Assert(length > 240);
const int StripesPerBlock = (SecretLengthBytes - StripeLengthBytes) / SecretConsumeRateBytes;
const int BlockLen = StripeLengthBytes * StripesPerBlock;
int blocksNum = (int)((length - 1) / BlockLen);
Accumulate(accumulators, source, secret, StripesPerBlock, true, blocksNum);
int offset = BlockLen * blocksNum;
int stripesNumber = (int)((length - 1 - offset) / StripeLengthBytes);
Accumulate(accumulators, source + offset, secret, stripesNumber);
Accumulate512(accumulators, source + length - StripeLengthBytes, secret + (SecretLengthBytes - StripeLengthBytes - SecretLastAccStartBytes));
}
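/// <summary>Accumulates <paramref name="stripes"/> stripes while tracking the position within the current block; when the block boundary is crossed the accumulators are scrambled once and accumulation continues into the next block.</summary>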
public static void ConsumeStripes(ulong* accumulators, ref ulong stripesSoFar, ulong stripesPerBlock, byte* source, ulong stripes, byte* secret)
{
Debug.Assert(stripes <= stripesPerBlock); // can handle max 1 scramble per invocation
Debug.Assert(stripesSoFar < stripesPerBlock);
ulong stripesToEndOfBlock = stripesPerBlock - stripesSoFar;
if (stripesToEndOfBlock <= stripes)
{
// need a scrambling operation
ulong stripesAfterBlock = stripes - stripesToEndOfBlock;
Accumulate(accumulators, source, secret + ((int)stripesSoFar * SecretConsumeRateBytes), (int)stripesToEndOfBlock);
ScrambleAccumulators(accumulators, secret + (SecretLengthBytes - StripeLengthBytes));
Accumulate(accumulators, source + ((int)stripesToEndOfBlock * StripeLengthBytes), secret, (int)stripesAfterBlock);
stripesSoFar = stripesAfterBlock;
}
else
{
Accumulate(accumulators, source, secret + ((int)stripesSoFar * SecretConsumeRateBytes), (int)stripes);
stripesSoFar += stripes;
}
}
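/// <summary>Streams additional input into the state. Input that fits is only buffered; otherwise the internal buffer is completed and consumed first, the bulk of the source is consumed directly in whole blocks or buffer-sized chunks, the last full stripe is copied to the end of the buffer for finalization, and any remaining bytes are buffered.</summary>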
public static void Append(ref State state, ReadOnlySpan<byte> source)
{
Debug.Assert(state.BufferedCount <= InternalBufferLengthBytes);
state.TotalLength += (uint)source.Length;
fixed (byte* buffer = state.Buffer)
{
// Small input: just copy the data to the buffer.
if (source.Length <= InternalBufferLengthBytes - state.BufferedCount)
{
source.CopyTo(new Span<byte>(buffer + state.BufferedCount, source.Length));
state.BufferedCount += (uint)source.Length;
return;
}
fixed (byte* secret = state.Secret)
fixed (ulong* accumulators = state.Accumulators)
fixed (byte* sourcePtr = &MemoryMarshal.GetReference(source))
{
// Internal buffer is partially filled (always, except at beginning). Complete it, then consume it.
int sourceIndex = 0;
if (state.BufferedCount != 0)
{
int loadSize = InternalBufferLengthBytes - (int)state.BufferedCount;
source.Slice(0, loadSize).CopyTo(new Span<byte>(buffer + state.BufferedCount, loadSize));
sourceIndex = loadSize;
ConsumeStripes(accumulators, ref state.StripesProcessedInCurrentBlock, NumStripesPerBlock, buffer, InternalBufferStripes, secret);
state.BufferedCount = 0;
}
Debug.Assert(sourceIndex < source.Length);
// Large input to consume: ingest per full block.
if (source.Length - sourceIndex > NumStripesPerBlock * StripeLengthBytes)
{
ulong stripes = (ulong)(source.Length - sourceIndex - 1) / StripeLengthBytes;
Debug.Assert(NumStripesPerBlock >= state.StripesProcessedInCurrentBlock);
// Join to current block's end.
ulong stripesToEnd = NumStripesPerBlock - state.StripesProcessedInCurrentBlock;
Debug.Assert(stripesToEnd <= stripes);
Accumulate(accumulators, sourcePtr + sourceIndex, secret + ((int)state.StripesProcessedInCurrentBlock * SecretConsumeRateBytes), (int)stripesToEnd);
ScrambleAccumulators(accumulators, secret + (SecretLengthBytes - StripeLengthBytes));
state.StripesProcessedInCurrentBlock = 0;
sourceIndex += (int)stripesToEnd * StripeLengthBytes;
stripes -= stripesToEnd;
// Consume entire blocks.
while (stripes >= NumStripesPerBlock)
{
Accumulate(accumulators, sourcePtr + sourceIndex, secret, NumStripesPerBlock);
ScrambleAccumulators(accumulators, secret + (SecretLengthBytes - StripeLengthBytes));
sourceIndex += NumStripesPerBlock * StripeLengthBytes;
stripes -= NumStripesPerBlock;
}
// Consume complete stripes in the last partial block.
Accumulate(accumulators, sourcePtr + sourceIndex, secret, (int)stripes);
sourceIndex += (int)stripes * StripeLengthBytes;
Debug.Assert(sourceIndex < source.Length); // at least some bytes left
state.StripesProcessedInCurrentBlock = stripes;
// Copy the last stripe into the end of the buffer so it is available to GetCurrentHashCore when processing the "stripe from the end".
source.Slice(sourceIndex - StripeLengthBytes, StripeLengthBytes).CopyTo(new Span<byte>(buffer + InternalBufferLengthBytes - StripeLengthBytes, StripeLengthBytes));
}
else if (source.Length - sourceIndex > InternalBufferLengthBytes)
{
// Content to consume <= block size. Consume source by a multiple of internal buffer size.
do
{
ConsumeStripes(accumulators, ref state.StripesProcessedInCurrentBlock, NumStripesPerBlock, sourcePtr + sourceIndex, InternalBufferStripes, secret);
sourceIndex += InternalBufferLengthBytes;
}
while (source.Length - sourceIndex > InternalBufferLengthBytes);
// Copy the last stripe into the end of the buffer so it is available to GetCurrentHashCore when processing the "stripe from the end".
source.Slice(sourceIndex - StripeLengthBytes, StripeLengthBytes).CopyTo(new Span<byte>(buffer + InternalBufferLengthBytes - StripeLengthBytes, StripeLengthBytes));
}
// Buffer the remaining input.
Span<byte> remaining = new Span<byte>(buffer, source.Length - sourceIndex);
Debug.Assert(sourceIndex < source.Length);
Debug.Assert(remaining.Length <= InternalBufferLengthBytes);
Debug.Assert(state.BufferedCount == 0);
source.Slice(sourceIndex).CopyTo(remaining);
state.BufferedCount = (uint)remaining.Length;
}
}
}
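/// <summary>Copies the eight 64-bit accumulators out of the state, using 256-bit or 128-bit vector loads/stores when hardware acceleration is available.</summary>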
public static void CopyAccumulators(ref State state, ulong* accumulators)
{
fixed (ulong* stateAccumulators = state.Accumulators)
{
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated)
{
Vector256.Store(Vector256.Load(stateAccumulators), accumulators);
Vector256.Store(Vector256.Load(stateAccumulators + 4), accumulators + 4);
}
else if (Vector128.IsHardwareAccelerated)
{
Vector128.Store(Vector128.Load(stateAccumulators), accumulators);
Vector128.Store(Vector128.Load(stateAccumulators + 2), accumulators + 2);
Vector128.Store(Vector128.Load(stateAccumulators + 4), accumulators + 4);
Vector128.Store(Vector128.Load(stateAccumulators + 6), accumulators + 6);
}
else
#endif
{
for (int i = 0; i < 8; i++)
{
accumulators[i] = stateAccumulators[i];
}
}
}
}
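/// <summary>Finalizes accumulation for long inputs: consumes any complete stripes still sitting in the internal buffer, then accumulates one final 64-byte stripe. If fewer than 64 bytes are buffered, that last stripe is reconstructed from the tail of the previously buffered data plus the remaining bytes.</summary>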
public static void DigestLong(ref State state, ulong* accumulators, byte* secret)
{
Debug.Assert(state.BufferedCount > 0);
fixed (byte* buffer = state.Buffer)
{
byte* accumulateData;
if (state.BufferedCount >= StripeLengthBytes)
{
uint stripes = (state.BufferedCount - 1) / StripeLengthBytes;
ulong stripesSoFar = state.StripesProcessedInCurrentBlock;
ConsumeStripes(accumulators, ref stripesSoFar, NumStripesPerBlock, buffer, stripes, secret);
accumulateData = buffer + state.BufferedCount - StripeLengthBytes;
}
else
{
byte* lastStripe = stackalloc byte[StripeLengthBytes];
int catchupSize = StripeLengthBytes - (int)state.BufferedCount;
new ReadOnlySpan<byte>(buffer + InternalBufferLengthBytes - catchupSize, catchupSize).CopyTo(new Span<byte>(lastStripe, StripeLengthBytes));
new ReadOnlySpan<byte>(buffer, (int)state.BufferedCount).CopyTo(new Span<byte>(lastStripe + catchupSize, (int)state.BufferedCount));
accumulateData = lastStripe;
}
Accumulate512(accumulators, accumulateData, secret + (SecretLengthBytes - StripeLengthBytes - SecretLastAccStartBytes));
}
}
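/// <summary>Fills the eight accumulators with the XXH3 starting values, a fixed arrangement of the 32-bit and 64-bit primes, vectorized when possible.</summary>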
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void InitializeAccumulators(ulong* accumulators)
{
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated)
{
Vector256.Store(Vector256.Create(Prime32_3, Prime64_1, Prime64_2, Prime64_3), accumulators);
Vector256.Store(Vector256.Create(Prime64_4, Prime32_2, Prime64_5, Prime32_1), accumulators + 4);
}
else if (Vector128.IsHardwareAccelerated)
{
Vector128.Store(Vector128.Create(Prime32_3, Prime64_1), accumulators);
Vector128.Store(Vector128.Create(Prime64_2, Prime64_3), accumulators + 2);
Vector128.Store(Vector128.Create(Prime64_4, Prime32_2), accumulators + 4);
Vector128.Store(Vector128.Create(Prime64_5, Prime32_1), accumulators + 6);
}
else
#endif
{
accumulators[0] = Prime32_3;
accumulators[1] = Prime64_1;
accumulators[2] = Prime64_2;
accumulators[3] = Prime64_3;
accumulators[4] = Prime64_4;
accumulators[5] = Prime32_2;
accumulators[6] = Prime64_5;
accumulators[7] = Prime32_1;
}
}
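/// <summary>Folds the eight accumulators into a single 64-bit value: each accumulator pair is XORed with secret words, multiplied 64x64 to 128 bits and XOR-folded, the four folds are added to <paramref name="start"/>, and the sum is avalanched.</summary>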
public static ulong MergeAccumulators(ulong* accumulators, byte* secret, ulong start)
{
ulong result64 = start;
result64 += Multiply64To128ThenFold(accumulators[0] ^ ReadUInt64LE(secret), accumulators[1] ^ ReadUInt64LE(secret + 8));
result64 += Multiply64To128ThenFold(accumulators[2] ^ ReadUInt64LE(secret + 16), accumulators[3] ^ ReadUInt64LE(secret + 24));
result64 += Multiply64To128ThenFold(accumulators[4] ^ ReadUInt64LE(secret + 32), accumulators[5] ^ ReadUInt64LE(secret + 40));
result64 += Multiply64To128ThenFold(accumulators[6] ^ ReadUInt64LE(secret + 48), accumulators[7] ^ ReadUInt64LE(secret + 56));
return Avalanche(result64);
}
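/// <summary>Mixes 16 bytes of input with two secret words and the seed: each 8-byte half is XORed with (secret + seed) or (secret - seed) respectively, then the two halves are multiplied 64x64 to 128 bits and XOR-folded.</summary>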
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Mix16Bytes(byte* source, ulong secretLow, ulong secretHigh, ulong seed) =>
Multiply64To128ThenFold(
ReadUInt64LE(source) ^ (secretLow + seed),
ReadUInt64LE(source + sizeof(ulong)) ^ (secretHigh - seed));
/// <summary>Calculates a 32-bit to 64-bit long multiply.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Multiply32To64(uint v1, uint v2) => (ulong)v1 * v2;
/// <summary>"This is a fast avalanche stage, suitable when input bits are already partially mixed."</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Avalanche(ulong hash)
{
hash = XorShift(hash, 37);
hash *= 0x165667919E3779F9;
hash = XorShift(hash, 32);
return hash;
}
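/// <summary>Calculates a 64-bit to 128-bit multiply, returning the upper 64 bits and writing the lower 64 bits to <paramref name="lower"/>. Uses Math.BigMul on .NET 5+ and a portable 32-bit decomposition otherwise.</summary>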
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Multiply64To128(ulong left, ulong right, out ulong lower)
{
#if NET5_0_OR_GREATER
return Math.BigMul(left, right, out lower);
#else
ulong lowerLow = Multiply32To64((uint)left, (uint)right);
ulong higherLow = Multiply32To64((uint)(left >> 32), (uint)right);
ulong lowerHigh = Multiply32To64((uint)left, (uint)(right >> 32));
ulong higherHigh = Multiply32To64((uint)(left >> 32), (uint)(right >> 32));
ulong cross = (lowerLow >> 32) + (higherLow & 0xFFFFFFFF) + lowerHigh;
ulong upper = (higherLow >> 32) + (cross >> 32) + higherHigh;
lower = (cross << 32) | (lowerLow & 0xFFFFFFFF);
return upper;
#endif
}
/// <summary>Calculates a 64-bit to 128-bit multiply, then XOR folds it.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong Multiply64To128ThenFold(ulong left, ulong right)
{
ulong upper = Multiply64To128(left, right, out ulong lower);
return lower ^ upper;
}
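/// <summary>Derives a custom secret from a seed by adding the seed to each even-indexed 64-bit word of the default secret and subtracting it from each odd-indexed word (in little-endian order), vectorized when possible.</summary>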
public static void DeriveSecretFromSeed(byte* destinationSecret, ulong seed)
{
fixed (byte* defaultSecret = &MemoryMarshal.GetReference(DefaultSecret))
{
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
Vector256<ulong> seedVec = Vector256.Create(seed, 0u - seed, seed, 0u - seed);
for (int i = 0; i < SecretLengthBytes; i += Vector256<byte>.Count)
{
Vector256.Store(Vector256.Load((ulong*)(defaultSecret + i)) + seedVec, (ulong*)(destinationSecret + i));
}
}
else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
Vector128<ulong> seedVec = Vector128.Create(seed, 0u - seed);
for (int i = 0; i < SecretLengthBytes; i += Vector128<byte>.Count)
{
Vector128.Store(Vector128.Load((ulong*)(defaultSecret + i)) + seedVec, (ulong*)(destinationSecret + i));
}
}
else
#endif
{
for (int i = 0; i < SecretLengthBytes; i += sizeof(ulong) * 2)
{
WriteUInt64LE(destinationSecret + i, ReadUInt64LE(defaultSecret + i) + seed);
WriteUInt64LE(destinationSecret + i + 8, ReadUInt64LE(defaultSecret + i + 8) - seed);
}
}
}
}
/// <summary>Optimized version of looping over <see cref="Accumulate512"/>.</summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static void Accumulate(ulong* accumulators, byte* source, byte* secret, int stripesToProcess, bool scramble = false, int blockCount = 1)
{
byte* secretForAccumulate = secret;
byte* secretForScramble = secret + (SecretLengthBytes - StripeLengthBytes);
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
Vector256<ulong> acc1 = Vector256.Load(accumulators);
Vector256<ulong> acc2 = Vector256.Load(accumulators + Vector256<ulong>.Count);
for (int j = 0; j < blockCount; j++)
{
secret = secretForAccumulate;
for (int i = 0; i < stripesToProcess; i++)
{
Vector256<uint> secretVal = Vector256.Load((uint*)secret);
acc1 = Accumulate256(acc1, source, secretVal);
source += Vector256<byte>.Count;
secretVal = Vector256.Load((uint*)secret + Vector256<uint>.Count);
acc2 = Accumulate256(acc2, source, secretVal);
source += Vector256<byte>.Count;
secret += SecretConsumeRateBytes;
}
if (scramble)
{
acc1 = ScrambleAccumulator256(acc1, Vector256.Load((ulong*)secretForScramble));
acc2 = ScrambleAccumulator256(acc2, Vector256.Load((ulong*)secretForScramble + Vector256<ulong>.Count));
}
}
Vector256.Store(acc1, accumulators);
Vector256.Store(acc2, accumulators + Vector256<ulong>.Count);
}
else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
Vector128<ulong> acc1 = Vector128.Load(accumulators);
Vector128<ulong> acc2 = Vector128.Load(accumulators + Vector128<ulong>.Count);
Vector128<ulong> acc3 = Vector128.Load(accumulators + Vector128<ulong>.Count * 2);
Vector128<ulong> acc4 = Vector128.Load(accumulators + Vector128<ulong>.Count * 3);
for (int j = 0; j < blockCount; j++)
{
secret = secretForAccumulate;
for (int i = 0; i < stripesToProcess; i++)
{
Vector128<uint> secretVal = Vector128.Load((uint*)secret);
acc1 = Accumulate128(acc1, source, secretVal);
source += Vector128<byte>.Count;
secretVal = Vector128.Load((uint*)secret + Vector128<uint>.Count);
acc2 = Accumulate128(acc2, source, secretVal);
source += Vector128<byte>.Count;
secretVal = Vector128.Load((uint*)secret + Vector128<uint>.Count * 2);
acc3 = Accumulate128(acc3, source, secretVal);
source += Vector128<byte>.Count;
secretVal = Vector128.Load((uint*)secret + Vector128<uint>.Count * 3);
acc4 = Accumulate128(acc4, source, secretVal);
source += Vector128<byte>.Count;
secret += SecretConsumeRateBytes;
}
if (scramble)
{
acc1 = ScrambleAccumulator128(acc1, Vector128.Load((ulong*)secretForScramble));
acc2 = ScrambleAccumulator128(acc2, Vector128.Load((ulong*)secretForScramble + Vector128<ulong>.Count));
acc3 = ScrambleAccumulator128(acc3, Vector128.Load((ulong*)secretForScramble + Vector128<ulong>.Count * 2));
acc4 = ScrambleAccumulator128(acc4, Vector128.Load((ulong*)secretForScramble + Vector128<ulong>.Count * 3));
}
}
Vector128.Store(acc1, accumulators);
Vector128.Store(acc2, accumulators + Vector128<ulong>.Count);
Vector128.Store(acc3, accumulators + Vector128<ulong>.Count * 2);
Vector128.Store(acc4, accumulators + Vector128<ulong>.Count * 3);
}
else
#endif
{
for (int j = 0; j < blockCount; j++)
{
for (int i = 0; i < stripesToProcess; i++)
{
Accumulate512Inlined(accumulators, source, secret + (i * SecretConsumeRateBytes));
source += StripeLengthBytes;
}
if (scramble)
{
ScrambleAccumulators(accumulators, secretForScramble);
}
}
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
public static void Accumulate512(ulong* accumulators, byte* source, byte* secret)
{
Accumulate512Inlined(accumulators, source, secret);
}
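/// <summary>Core XXH3 round for a single 64-byte stripe: each 64-bit input lane is XORed with the matching secret lane, the two 32-bit halves of that value are multiplied together and added to the lane's accumulator, and the raw input lane is added to its paired (adjacent) accumulator.</summary>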
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Accumulate512Inlined(ulong* accumulators, byte* source, byte* secret)
{
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
for (int i = 0; i < AccumulatorCount / Vector256<ulong>.Count; i++)
{
Vector256<ulong> accVec = Accumulate256(Vector256.Load(accumulators), source, Vector256.Load((uint*)secret));
Vector256.Store(accVec, accumulators);
accumulators += Vector256<ulong>.Count;
secret += Vector256<byte>.Count;
source += Vector256<byte>.Count;
}
}
else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
for (int i = 0; i < AccumulatorCount / Vector128<ulong>.Count; i++)
{
Vector128<ulong> accVec = Accumulate128(Vector128.Load(accumulators), source, Vector128.Load((uint*)secret));
Vector128.Store(accVec, accumulators);
accumulators += Vector128<ulong>.Count;
secret += Vector128<byte>.Count;
source += Vector128<byte>.Count;
}
}
else
#endif
{
for (int i = 0; i < AccumulatorCount; i++)
{
ulong sourceVal = ReadUInt64LE(source + (8 * i));
ulong sourceKey = sourceVal ^ ReadUInt64LE(secret + (i * 8));
accumulators[i ^ 1] += sourceVal; // swap adjacent lanes
accumulators[i] += Multiply32To64((uint)sourceKey, (uint)(sourceKey >> 32));
}
}
}
#if NET8_0_OR_GREATER
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<ulong> Accumulate256(Vector256<ulong> accVec, byte* source, Vector256<uint> secret)
{
Vector256<uint> sourceVec = Vector256.Load((uint*)source);
Vector256<uint> sourceKey = sourceVec ^ secret;
// TODO: Figure out how to unwind this shuffle and just use Vector256.Multiply
Vector256<uint> sourceKeyLow = Vector256.Shuffle(sourceKey, Vector256.Create(1u, 0, 3, 0, 5, 0, 7, 0));
Vector256<uint> sourceSwap = Vector256.Shuffle(sourceVec, Vector256.Create(2u, 3, 0, 1, 6, 7, 4, 5));
Vector256<ulong> sum = accVec + sourceSwap.AsUInt64();
Vector256<ulong> product = Avx2.IsSupported ?
Avx2.Multiply(sourceKey, sourceKeyLow) :
(sourceKey & Vector256.Create(~0u, 0u, ~0u, 0u, ~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceKeyLow & Vector256.Create(~0u, 0u, ~0u, 0u, ~0u, 0u, ~0u, 0u)).AsUInt64();
accVec = product + sum;
return accVec;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> Accumulate128(Vector128<ulong> accVec, byte* source, Vector128<uint> secret)
{
Vector128<uint> sourceVec = Vector128.Load((uint*)source);
Vector128<uint> sourceKey = sourceVec ^ secret;
// TODO: Figure out how to unwind this shuffle and just use Vector128.Multiply
Vector128<uint> sourceSwap = Vector128.Shuffle(sourceVec, Vector128.Create(2u, 3, 0, 1));
Vector128<ulong> sum = accVec + sourceSwap.AsUInt64();
Vector128<ulong> product = MultiplyWideningLower(sourceKey);
accVec = product + sum;
return accVec;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> MultiplyWideningLower(Vector128<uint> source)
{
if (AdvSimd.IsSupported)
{
Vector64<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(0u, 2, 0, 0)).GetLower();
Vector64<uint> sourceHigh = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0)).GetLower();
return AdvSimd.MultiplyWideningLower(sourceLow, sourceHigh);
}
else
{
Vector128<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0));
return Sse2.IsSupported ?
Sse2.Multiply(source, sourceLow) :
(source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64();
}
}
#endif
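/// <summary>Scrambles each accumulator at the end of a block: xorshift right by 47, XOR with a secret word, then multiply by <see cref="Prime32_1"/>, keeping the accumulators well mixed between blocks.</summary>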
private static void ScrambleAccumulators(ulong* accumulators, byte* secret)
{
#if NET8_0_OR_GREATER
if (Vector256.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
for (int i = 0; i < AccumulatorCount / Vector256<ulong>.Count; i++)
{
Vector256<ulong> accVec = ScrambleAccumulator256(Vector256.Load(accumulators), Vector256.Load((ulong*)secret));
Vector256.Store(accVec, accumulators);
accumulators += Vector256<ulong>.Count;
secret += Vector256<byte>.Count;
}
}
else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
{
for (int i = 0; i < AccumulatorCount / Vector128<ulong>.Count; i++)
{
Vector128<ulong> accVec = ScrambleAccumulator128(Vector128.Load(accumulators), Vector128.Load((ulong*)secret));
Vector128.Store(accVec, accumulators);
accumulators += Vector128<ulong>.Count;
secret += Vector128<byte>.Count;
}
}
else
#endif
{
for (int i = 0; i < AccumulatorCount; i++)
{
ulong xorShift = XorShift(*accumulators, 47);
ulong xorWithKey = xorShift ^ ReadUInt64LE(secret);
*accumulators = xorWithKey * Prime32_1;
accumulators++;
secret += sizeof(ulong);
}
}
}
#if NET8_0_OR_GREATER
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<ulong> ScrambleAccumulator256(Vector256<ulong> accVec, Vector256<ulong> secret)
{
Vector256<ulong> xorShift = accVec ^ Vector256.ShiftRightLogical(accVec, 47);
Vector256<ulong> xorWithKey = xorShift ^ secret;
accVec = xorWithKey * Vector256.Create((ulong)Prime32_1);
return accVec;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> ScrambleAccumulator128(Vector128<ulong> accVec, Vector128<ulong> secret)
{
Vector128<ulong> xorShift = accVec ^ Vector128.ShiftRightLogical(accVec, 47);
Vector128<ulong> xorWithKey = xorShift ^ secret;
accVec = xorWithKey * Vector128.Create((ulong)Prime32_1);
return accVec;
}
#endif
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong XorShift(ulong value, int shift)
{
Debug.Assert(shift >= 0 && shift < 64);
return value ^ (value >> shift);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint ReadUInt32LE(byte* data) =>
BitConverter.IsLittleEndian ?
Unsafe.ReadUnaligned<uint>(data) :
BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<uint>(data));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong ReadUInt64LE(byte* data) =>
BitConverter.IsLittleEndian ?
Unsafe.ReadUnaligned<ulong>(data) :
BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<ulong>(data));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteUInt64LE(byte* data, ulong value)
{
if (!BitConverter.IsLittleEndian)
{
value = BinaryPrimitives.ReverseEndianness(value);
}
Unsafe.WriteUnaligned(data, value);
}
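/// <summary>Mutable streaming state shared by the XxHash3 and XxHash128 implementations: the accumulators, the (possibly seed-derived) secret, the internal input buffer, and the bookkeeping counters.</summary>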
[StructLayout(LayoutKind.Auto)]
public struct State
{
/// <summary>The accumulators. Length is <see cref="AccumulatorCount"/>.</summary>
internal fixed ulong Accumulators[AccumulatorCount];
/// <summary>Used to store a custom secret generated from a seed. Length is <see cref="SecretLengthBytes"/>.</summary>
internal fixed byte Secret[SecretLengthBytes];
/// <summary>The internal buffer. Length is <see cref="InternalBufferLengthBytes"/>.</summary>
internal fixed byte Buffer[InternalBufferLengthBytes];
/// <summary>The number of bytes currently buffered in <see cref="Buffer"/>.</summary>
internal uint BufferedCount;
/// <summary>Number of stripes processed in the current block.</summary>
internal ulong StripesProcessedInCurrentBlock;
/// <summary>Total length hashed.</summary>
internal ulong TotalLength;
/// <summary>The seed employed (possibly 0).</summary>
internal ulong Seed;
};
}
}