File: DataViewRowId.cs
Web Access
Project: src\src\Microsoft.ML.DataView\Microsoft.ML.DataView.csproj (Microsoft.ML.DataView)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
 
namespace Microsoft.ML.Data
{
    /// <summary>
    /// A structure serving as the identifier of a row of <see cref="IDataView"/>.
    /// For datasets with millions of records, those IDs need to be unique, therefore the need for such a large structure to hold the values.
    /// Those Ids are derived from other Ids of the previous components of the pipelines, and dividing the structure in two: high order and low order of bits,
    /// and reduces the changes of those collisions even further.
    /// </summary>
    /// <seealso cref="DataViewRow.GetIdGetter"/>
    public readonly struct DataViewRowId : IComparable<DataViewRowId>, IEquatable<DataViewRowId>
    {
        ///<summary>The low order bits. Corresponds to H1 in the Murmur algorithms.</summary>
        public readonly ulong Low;
 
        ///<summary> The high order bits. Corresponds to H2 in the Murmur algorithms.</summary>
        public readonly ulong High;
 
        /// <summary>
        /// Initializes a new instance of <see cref="DataViewRowId"/>
        /// </summary>
        /// <param name="low">The low order <langword>ulong</langword>.</param>
        /// <param name="high">The high order <langword>ulong</langword>.</param>
        public DataViewRowId(ulong low, ulong high)
        {
            Low = low;
            High = high;
        }
 
        public override string ToString()
        {
            // Since H1 are the low order bits, they are printed second.
            return string.Format("{0:x16}{1:x16}", High, Low);
        }
 
        public int CompareTo(DataViewRowId other)
        {
            int result = High.CompareTo(other.High);
            return result == 0 ? Low.CompareTo(other.Low) : result;
        }
 
        public bool Equals(DataViewRowId other)
        {
            return Low == other.Low && High == other.High;
        }
 
        public override bool Equals(object obj)
        {
            if (obj is DataViewRowId other)
            {
                return Equals(other);
            }
            return false;
        }
 
        public override int GetHashCode()
        {
            return (int)(
                (uint)Low ^ (uint)(Low >> 32) ^
                (uint)(High << 7) ^ (uint)(High >> 57) ^ (uint)(High >> (57 - 32)));
        }
 
        #region Hashing style
 
        // This is adapted from reference Murmur3 128-bit implementation at
        // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
        // The transformation operations do not mix the result, since it only necessary
        // that the result be unique, not that its bits have any sort of independence
        // or other distributional characteristics.
 
        private const ulong _c1 = 0x87c37b91114253d5;
        private const ulong _c2 = 0x4cf5ad432745937f;
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void RotL(ref ulong x, int r)
        {
            Debug.Assert(0 < r && r < 64);
            x = (x << r) | (x >> (64 - r));
        }
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static ulong RotL(ulong x, int r)
        {
            Debug.Assert(0 < r && r < 64);
            return (x << r) | (x >> (64 - r));
        }
 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void FMix(ref ulong k)
        {
            k ^= k >> 33;
            k *= 0xff51afd7ed558ccd;
            k ^= k >> 33;
            k *= 0xc4ceb9fe1a85ec53;
            k ^= k >> 33;
        }
 
        private static void FinalMix(ref ulong h1, ref ulong h2, int len)
        {
            h1 ^= (ulong)len;
            h2 ^= (ulong)len;
            h1 += h2;
            h2 += h1;
            FMix(ref h1);
            FMix(ref h2);
            h1 += h2;
            h2 += h1;
        }
 
        /// <summary>
        /// An operation that treats the value as an unmixed Murmur3 128-bit hash state,
        /// and returns the hash state that would result if we hashed an addition 16 bytes
        /// that were all zeros, except for the last bit which is one.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public DataViewRowId Fork()
        {
            ulong h1 = Low;
            ulong h2 = High;
            // Here it's as if k1=1, k2=0.
            h1 = RotL(h1, 27);
            h1 += h2;
            h1 = h1 * 5 + 0x52dce729;
            h2 = RotL(h2, 31);
            h2 += h1;
            h2 = h2 * 5 + 0x38495ab5;
            h1 ^= RotL(_c1, 31) * _c2;
            return new DataViewRowId(h1, h2);
        }
 
        /// <summary>
        /// An operation that treats the value as an unmixed Murmur3 128-bit hash state,
        /// and returns the hash state that would result if we hashed an addition 16 bytes
        /// that were all zeros.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public DataViewRowId Next()
        {
            ulong h1 = Low;
            ulong h2 = High;
            // Here it's as if k1=0, k2=0.
            h1 = RotL(h1, 27);
            h1 += h2;
            h1 = h1 * 5 + 0x52dce729;
            h2 = RotL(h2, 31);
            h2 += h1;
            h2 = h2 * 5 + 0x38495ab5;
            return new DataViewRowId(h1, h2);
        }
 
        /// <summary>
        /// An operation that treats the value as an unmixed Murmur3 128-bit hash state,
        /// and returns the hash state that would result if we took <paramref name="other"/>,
        /// scrambled it using <see cref="Fork"/>, then hashed the result of that.
        /// </summary>
        /// <param name="other"></param>
        /// <returns></returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public DataViewRowId Combine(DataViewRowId other)
        {
            var h1 = Low;
            var h2 = High;
 
            other = other.Fork();
            ulong k1 = other.Low; // First 8 bytes.
            ulong k2 = other.High; // Second 8 bytes.
 
            k1 *= _c1;
            k1 = RotL(k1, 31);
            k1 *= _c2;
            h1 ^= k1;
            h1 = RotL(h1, 27);
            h1 += h2;
            h1 = h1 * 5 + 0x52dce729;
 
            k2 *= _c2;
            k2 = RotL(k2, 33);
            k2 *= _c1;
            h2 ^= k2;
            h2 = RotL(h2, 31);
            h2 += h1;
            h2 = h2 * 5 + 0x38495ab5;
 
            return new DataViewRowId(h1, h2);
        }
        #endregion
    }
}