|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Runtime.CompilerServices;
using System.Text;
using Microsoft.ML.Runtime;
namespace Microsoft.ML.Internal.Utilities
{
[BestFriend]
internal static class Hashing
{
private const uint _defaultSeed = (5381 << 16) + 5381;
public static uint CombineHash(uint u1, uint u2)
{
return ((u1 << 7) | (u1 >> 25)) ^ u2;
}
public static int CombineHash(int n1, int n2)
{
return (int)CombineHash((uint)n1, (uint)n2);
}
/// <summary>
/// Creates a combined hash of possibly heterogeneously typed values.
/// </summary>
/// <param name="startHash">The leading hash, incorporated into the final hash</param>
/// <param name="os">A variable list of objects, where null is a valid value</param>
/// <returns>The combined hash incorporating a starting hash, and the hash codes
/// of all input values</returns>
public static int CombinedHash(int startHash, params object[] os)
{
foreach (object o in os)
startHash = CombineHash(startHash, o == null ? 0 : o.GetHashCode());
return startHash;
}
/// <summary>
/// Creates a combined hash of multiple homogeneous typed non-null values.
/// </summary>
/// <typeparam name="T">The type of items to hash</typeparam>
/// <param name="startHash">The leading hash, incorporated into the final hash</param>
/// <param name="os">A variable list of non-null values</param>
/// <returns>The combined hash incorporating a starting hash, and the hash codes
/// of all input values</returns>
public static int CombinedHash<T>(int startHash, params T[] os)
{
foreach (T o in os)
startHash = CombineHash(startHash, o.GetHashCode());
return startHash;
}
public static uint HashUint(uint u)
{
ulong uu = (ulong)u * 0x7ff19519UL; // this number is prime.
return Utils.GetLo(uu) + Utils.GetHi(uu);
}
public static int HashInt(int n)
{
return (int)HashUint((uint)n);
}
/// <summary>
/// Hash the characters in a <see cref="ReadOnlySpan{T}"/> of <see cref="char"/>.
/// This MUST produce the same result as the other overloads (with equivalent characters).
/// </summary>
public static uint HashString(ReadOnlySpan<char> str) => MurmurHash(_defaultSeed, str);
/// <summary>
/// Hash the characters in a string builder. This MUST produce the same result
/// as HashString(sb.ToString()).
/// </summary>
public static uint HashString(StringBuilder sb)
{
Contracts.AssertValue(sb);
return MurmurHash(_defaultSeed, sb, 0, sb.Length);
}
public static uint HashSequence(uint[] sequence, int min, int lim)
{
return MurmurHash(_defaultSeed, sequence, min, lim);
}
/// <summary>
/// Combines the given hash value with a uint value, using the murmur hash 3 algorithm.
/// Make certain to also use <see cref="MixHash(uint)"/> or <see cref="MixHash(uint, int)"/> on the final hashed value,
/// if you depend upon having distinct bits.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MurmurRound(uint hash, uint chunk)
{
chunk *= 0xCC9E2D51;
chunk = Rotate(chunk, 15);
chunk *= 0x1B873593;
hash ^= chunk;
hash = Rotate(hash, 13);
hash *= 5;
hash += 0xE6546B64;
return hash;
}
/// <summary>
/// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding.
/// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th.
/// That is, it simply converts char values to one, two, or three bytes according to
/// the following rules:
/// * 0x0000 to 0x007F : 0xxxxxxx
/// * 0x0080 to 0x07FF : 110xxxxx 10xxxxxx
/// * 0x0800 to 0xFFFF : 1110xxxx 10xxxxxx 10xxxxxx
/// NOTE: This MUST match the StringBuilder version below.
/// </summary>
public static uint MurmurHash(uint hash, ReadOnlySpan<char> span, bool toUpper = false)
{
// Byte length (in pseudo UTF-8 form).
int len = 0;
// Current bits, value and count.
ulong cur = 0;
int bits = 0;
for (int ich = 0; ich < span.Length; ich++)
{
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
uint ch = toUpper ? char.ToUpperInvariant(span[ich]) : span[ich];
if (ch <= 0x007F)
{
cur |= ch << bits;
bits += 8;
}
else if (ch <= 0x07FF)
{
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x1F00) | 0xC080) << bits;
bits += 16;
}
else
{
Contracts.Assert(ch <= 0xFFFF);
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits;
bits += 24;
}
if (bits >= 32)
{
hash = MurmurRound(hash, (uint)cur);
cur = cur >> 32;
bits -= 32;
len += 4;
}
}
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
if (bits > 0)
{
hash = MurmurRound(hash, (uint)cur);
len += bits / 8;
}
// Encode the length.
hash = MurmurRound(hash, (uint)len);
// Final mixing ritual for the hash.
hash = MixHash(hash);
return hash;
}
// MurmurHashV2 is an implementation of MurmurHash3_x86_32 (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp#L94) used by Onnxruntime's
// MurmurHash operator, implemented to have matching hashing results between ORT and ML.NET.
// One key difference between the two implementations is that strings use a different hashing algorithm in the previous version, but
// every type uses the same implementation on V2.
// The V1 String Hashing Algorithm had the following properities:
// - Case Conversion: used inside the hashing algorithm in ML.Net.
// - Mock UTF8 encoding: strings in C# are UTF16 and need to be converted to UTF8 before hashing
public static uint MurmurHashV2(uint hash, ReadOnlySpan<char> span, bool toUpper = false)
{
// Byte length (in pseudo UTF-8 form).
int len = 0;
// Current bits, value and count.
ulong cur = 0;
int bits = 0;
for (int ich = 0; ich < span.Length; ich++)
{
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
uint ch = toUpper ? char.ToUpperInvariant(span[ich]) : span[ich];
if (ch <= 0x007F)
{
cur |= ch << bits;
bits += 8;
}
else if (ch <= 0x07FF)
{
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x1F00) | 0xC080) << bits;
cur = (cur & 0xFF) << 8 | cur >> 8;
bits += 16;
}
else if (ch <= 0xFFFF)
{
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits;
cur = (cur & 0xFF) << 16 | ((cur >> 8) & 0xFF) << 8 | cur >> 16;
bits += 24;
}
else
{
Contracts.Assert(ch <= 0x10FFFF);
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x3F0000) | ((ch << 6) & 0x07000000) | 0xF0808080) << bits;
cur = (cur & 0xFF) << 24 | ((cur >> 8) & 0xFF) << 16 | ((cur >> 16) & 0xFF) << 8 | cur >> 24;
bits += 32;
}
if (bits >= 32)
{
hash = MurmurRound(hash, (uint)cur);
cur = cur >> 32;
bits -= 32;
len += 4;
}
}
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
if (bits > 0)
{
len += bits / 8;
}
// tail processing
uint k1 = 0;
switch (len & 3)
{
case 3:
k1 ^= (uint)(((cur >> 16) & 0xFF) << 16);
goto case 2;
case 2:
k1 ^= (uint)((cur >> 8) & 0xFF) << 8;
goto case 1;
case 1:
k1 ^= (uint)(cur & 0xFF);
k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15);
k1 *= 0x1B873593;
hash ^= k1;
break;
}
// Final mixing ritual for the hash.
hash = MixHash(hash, len);
return hash;
}
/// <summary>
/// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding.
/// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th.
/// That is, it simply converts char values to one, two, or three bytes according to
/// the following rules:
/// * 0x0000 to 0x007F : 0xxxxxxx
/// * 0x0080 to 0x07FF : 110xxxxx 10xxxxxx
/// * 0x0800 to 0xFFFF : 1110xxxx 10xxxxxx 10xxxxxx
/// NOTE: This MUST match the string version above.
/// </summary>
public static uint MurmurHash(uint hash, StringBuilder data, int ichMin, int ichLim, bool toUpper = false)
{
Contracts.Assert(0 <= ichMin && ichMin <= ichLim && ichLim <= Utils.Size(data));
uint seed = hash;
// Byte length (in pseudo UTF-8 form).
int len = 0;
// Current bits, value and count.
ulong cur = 0;
int bits = 0;
for (int ich = ichMin; ich < ichLim; ich++)
{
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
uint ch = toUpper ? char.ToUpperInvariant(data[ich]) : data[ich];
if (ch <= 0x007F)
{
cur |= ch << bits;
bits += 8;
}
else if (ch <= 0x07FF)
{
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x1F00) | 0xC080) << bits;
bits += 16;
}
else
{
Contracts.Assert(ch <= 0xFFFF);
cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits;
bits += 24;
}
if (bits >= 32)
{
hash = MurmurRound(hash, (uint)cur);
cur = cur >> 32;
bits -= 32;
len += 4;
}
}
Contracts.Assert((bits & 0x7) == 0);
Contracts.Assert((uint)bits <= 24);
Contracts.Assert(cur <= 0x00FFFFFF);
if (bits > 0)
{
hash = MurmurRound(hash, (uint)cur);
len += bits / 8;
}
// Encode the length.
hash = MurmurRound(hash, (uint)len);
// Final mixing ritual for the hash.
hash = MixHash(hash);
Contracts.Assert(hash == MurmurHash(seed, data.ToString().AsSpan()));
return hash;
}
/// <summary>
/// Performs a MurmurRound on each int in the sequence
/// </summary>
public static uint MurmurHash(uint hash, uint[] data, int min, int lim)
{
Contracts.Check(0 <= min);
Contracts.Check(min <= lim);
Contracts.Check(lim <= Utils.Size(data));
for (int i = min; i < lim; i++)
hash = MurmurRound(hash, data[i]);
hash = MurmurRound(hash, (uint)(lim - min));
// Final mixing ritual for the hash.
hash = MixHash(hash);
return hash;
}
/// <summary>
/// The final mixing ritual for the Murmur3 hashing algorithm. Most users of
/// <see cref="MurmurRound"/> will want to close their progressive building of
/// a hash with a call to this method.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MixHash(uint hash)
{
hash ^= hash >> 16;
hash *= 0x85ebca6b;
hash ^= hash >> 13;
hash *= 0xc2b2ae35;
hash ^= hash >> 16;
return hash;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MixHash(uint hash, int len)
{
hash ^= (uint)len;
hash ^= hash >> 16;
hash *= 0x85ebca6b;
hash ^= hash >> 13;
hash *= 0xc2b2ae35;
hash ^= hash >> 16;
return hash;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint Rotate(uint x, int r)
{
return (x << r) | (x >> (32 - r));
}
}
}
|