File: System\Collections\Frozen\FrozenHashTable.cs
Web Access
Project: src\src\libraries\System.Collections.Immutable\src\System.Collections.Immutable.csproj (System.Collections.Immutable)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
 
namespace System.Collections.Frozen
{
    /// <summary>Provides the core hash table for use in frozen collections.</summary>
    /// <remarks>
    /// This hash table doesn't track any of the collection state. It merely keeps track
    /// of hash codes and of mapping these hash codes to spans of entries within the collection.
    /// </remarks>
    internal readonly struct FrozenHashTable
    {
        private readonly Bucket[] _buckets;
        private readonly ulong _fastModMultiplier;
 
        /// <summary>Initializes the hashtable with the computed hashcodes and bucket information.</summary>
        /// <param name="hashCodes">The array of hashcodes grouped into contiguous regions by bucket. Each bucket is one and only one region of the array.</param>
        /// <param name="buckets">
        /// The array of buckets, indexed by hashCodes % buckets.Length, where each bucket is
        /// the start/end index into <paramref name="hashCodes"/> for all items in that bucket.
        /// </param>
        /// <param name="fastModMultiplier">The multiplier to use as part of a FastMod method call.</param>
        private FrozenHashTable(int[] hashCodes, Bucket[] buckets, ulong fastModMultiplier)
        {
            Debug.Assert(hashCodes.Length != 0);
            Debug.Assert(buckets.Length != 0);
 
            HashCodes = hashCodes;
            _buckets = buckets;
            _fastModMultiplier = fastModMultiplier;
        }
 
        /// <summary>Initializes a frozen hash table.</summary>
        /// <param name="hashCodes">Pre-calculated hash codes. When the method finishes, it assigns each value to destination index.</param>
        /// <param name="hashCodesAreUnique">True when the input hash codes are already unique (provided from a dictionary of integers for example).</param>
        /// <remarks>
        /// It will then determine the optimal number of hash buckets to allocate and will populate the
        /// bucket table. The caller is responsible to consume the values written to <paramref name="hashCodes"/> and update the destination (if desired).
        /// <see cref="FindMatchingEntries(int, out int, out int)"/> then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
        /// </remarks>
        /// <returns>A frozen hash table.</returns>
        public static FrozenHashTable Create(Span<int> hashCodes, bool hashCodesAreUnique = false)
        {
            // Determine how many buckets to use.  This might be fewer than the number of entries
            // if any entries have identical hashcodes (not just different hashcodes that might
            // map to the same bucket).
            int numBuckets = CalcNumBuckets(hashCodes, hashCodesAreUnique);
            ulong fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)numBuckets);
 
            // Create two spans:
            // - bucketStarts: initially filled with all -1s, the ith element stores the index
            //   into hashCodes of the head element of that bucket's chain.
            // - nexts: the ith element stores the index of the next item in the chain.
            int[] arrayPoolBuckets = ArrayPool<int>.Shared.Rent(numBuckets + hashCodes.Length);
            Span<int> bucketStarts = arrayPoolBuckets.AsSpan(0, numBuckets);
            Span<int> nexts = arrayPoolBuckets.AsSpan(numBuckets, hashCodes.Length);
            bucketStarts.Fill(-1);
 
            // Populate the bucket entries and starts.  For each hash code, compute its bucket,
            // and store at the bucket entry corresponding to the hashcode item the entry for that
            // item, which includes a copy of the hash code and the current bucket start, which
            // is then replaced by this entry as it's pushed into the bucket list.
            for (int index = 0; index < hashCodes.Length; index++)
            {
                int hashCode = hashCodes[index];
                int bucketNum = (int)HashHelpers.FastMod((uint)hashCode, (uint)bucketStarts.Length, fastModMultiplier);
 
                ref int bucketStart = ref bucketStarts[bucketNum];
                nexts[index] = bucketStart;
                bucketStart = index;
            }
 
            // Write out the hashcodes and buckets arrays to be used by the FrozenHashtable instance.
            // We iterate through each bucket start, and from each, each item in that chain, writing
            // out all of the items in each chain next to each other in the hashcodes list (and
            // calling the setter to allow the consumer to reorder its entries appropriately).
            // Along the way we could how many items are in each chain, and use that along with
            // the starting index to write out the bucket information for indexing into hashcodes.
            var hashtableHashcodes = new int[hashCodes.Length];
            var hashtableBuckets = new Bucket[bucketStarts.Length];
            int count = 0;
            for (int bucketNum = 0; bucketNum < hashtableBuckets.Length; bucketNum++)
            {
                int bucketStart = bucketStarts[bucketNum];
                if (bucketStart < 0)
                {
                    continue;
                }
 
                int bucketCount = 0;
                int index = bucketStart;
                bucketStart = count;
                while (index >= 0)
                {
                    ref int hashCode = ref hashCodes[index];
                    hashtableHashcodes[count] = hashCode;
                    // we have used the hash code for the last time, now we re-use the buffer to store destination index
                    hashCode = count;
                    count++;
                    bucketCount++;
 
                    index = nexts[index];
                }
 
                hashtableBuckets[bucketNum] = new Bucket(bucketStart, bucketCount);
            }
            Debug.Assert(count == hashtableHashcodes.Length);
 
            ArrayPool<int>.Shared.Return(arrayPoolBuckets);
 
            return new FrozenHashTable(hashtableHashcodes, hashtableBuckets, fastModMultiplier);
        }
 
        /// <summary>
        /// Given a hash code, return the first index and last index for the associated matching entries.
        /// </summary>
        /// <param name="hashCode">The hash code to probe for.</param>
        /// <param name="startIndex">A variable that receives the index of the first matching entry.</param>
        /// <param name="endIndex">A variable that receives the index of the last matching entry plus 1.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void FindMatchingEntries(int hashCode, out int startIndex, out int endIndex)
        {
            Bucket[] buckets = _buckets;
            ref Bucket b = ref buckets[HashHelpers.FastMod((uint)hashCode, (uint)buckets.Length, _fastModMultiplier)];
            startIndex = b.StartIndex;
            endIndex = b.EndIndex;
        }
 
        public int Count => HashCodes.Length;
 
        internal int[] HashCodes { get; }
 
        /// <summary>
        /// Given a span of hash codes, figure out the best number of hash buckets to use.
        /// </summary>
        /// <remarks>
        /// This tries to select a prime number of buckets. Rather than iterating through all possible bucket
        /// sizes, starting at the exact number of hash codes and incrementing the bucket count by 1 per trial,
        /// this is a trade-off between speed of determining a good number of buckets and maximal density.
        /// </remarks>
        private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool hashCodesAreUnique)
        {
            Debug.Assert(hashCodes.Length != 0);
            Debug.Assert(!hashCodesAreUnique || new HashSet<int>(hashCodes.ToArray()).Count == hashCodes.Length);
 
            const double AcceptableCollisionRate = 0.05;  // What is a satisfactory rate of hash collisions?
            const int LargeInputSizeThreshold = 1000;     // What is the limit for an input to be considered "small"?
            const int MaxSmallBucketTableMultiplier = 16; // How large a bucket table should be allowed for small inputs?
            const int MaxLargeBucketTableMultiplier = 3;  // How large a bucket table should be allowed for large inputs?
 
            // Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes.
            HashSet<int>? codes = null;
            int uniqueCodesCount = hashCodes.Length;
            if (!hashCodesAreUnique)
            {
                codes =
#if NET
                    new HashSet<int>(hashCodes.Length);
#else
                    new HashSet<int>();
#endif
                foreach (int hashCode in hashCodes)
                {
                    codes.Add(hashCode);
                }
                uniqueCodesCount = codes.Count;
            }
            Debug.Assert(uniqueCodesCount != 0);
 
            // Based on our observations, in more than 99.5% of cases the number of buckets that meets our criteria is
            // at least twice as big as the number of unique hash codes.
            int minNumBuckets = uniqueCodesCount * 2;
 
            // In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of
            // hash codes. If there are more codes than in our precomputed primes table, which accommodates millions of values,
            // give up and just use the next prime.
            ReadOnlySpan<int> primes = HashHelpers.Primes;
            int minPrimeIndexInclusive = 0;
            while ((uint)minPrimeIndexInclusive < (uint)primes.Length && minNumBuckets > primes[minPrimeIndexInclusive])
            {
                minPrimeIndexInclusive++;
            }
 
            if (minPrimeIndexInclusive >= primes.Length)
            {
                return HashHelpers.GetPrime(uniqueCodesCount);
            }
 
            // Determine the largest number of buckets we're willing to use, based on a multiple of the number of inputs.
            // For smaller inputs, we allow for a larger multiplier.
            int maxNumBuckets =
                uniqueCodesCount *
                (uniqueCodesCount >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);
 
            // Find the index of the smallest prime that accommodates our max buckets.
            int maxPrimeIndexExclusive = minPrimeIndexInclusive;
            while ((uint)maxPrimeIndexExclusive < (uint)primes.Length && maxNumBuckets > primes[maxPrimeIndexExclusive])
            {
                maxPrimeIndexExclusive++;
            }
 
            if (maxPrimeIndexExclusive < primes.Length)
            {
                Debug.Assert(maxPrimeIndexExclusive != 0);
                maxNumBuckets = primes[maxPrimeIndexExclusive - 1];
            }
 
            const int BitsPerInt32 = 32;
            int[] seenBuckets = ArrayPool<int>.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1);
 
            int bestNumBuckets = maxNumBuckets;
            int bestNumCollisions = uniqueCodesCount;
            int numBuckets = 0, numCollisions = 0;
 
            // Iterate through each available prime between the min and max discovered. For each, compute
            // the collision ratio.
            for (int primeIndex = minPrimeIndexInclusive; primeIndex < maxPrimeIndexExclusive; primeIndex++)
            {
                // Get the number of buckets to try, and clear our seen bucket bitmap.
                numBuckets = primes[primeIndex];
                Array.Clear(seenBuckets, 0, Math.Min(numBuckets, seenBuckets.Length));
 
                // Determine the bucket for each hash code and mark it as seen. If it was already seen,
                // track it as a collision.
                numCollisions = 0;
 
                if (codes is not null && uniqueCodesCount != hashCodes.Length)
                {
                    foreach (int code in codes)
                    {
                        if (!IsBucketFirstVisit(code))
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // All of the hash codes in hashCodes are unique. In such scenario, it's faster to iterate over a span.
                    foreach (int code in hashCodes)
                    {
                        if (!IsBucketFirstVisit(code))
                        {
                            break;
                        }
                    }
                }
 
                // If this evaluation resulted in fewer collisions, use it as the best instead.
                // And if it's below our collision threshold, we're done.
                if (numCollisions < bestNumCollisions)
                {
                    bestNumBuckets = numBuckets;
 
                    if (numCollisions / (double)uniqueCodesCount <= AcceptableCollisionRate)
                    {
                        break;
                    }
 
                    bestNumCollisions = numCollisions;
                }
            }
 
            ArrayPool<int>.Shared.Return(seenBuckets);
 
            return bestNumBuckets;
 
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            bool IsBucketFirstVisit(int code)
            {
                uint bucketNum = (uint)code % (uint)numBuckets;
                if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
                {
                    numCollisions++;
                    if (numCollisions >= bestNumCollisions)
                    {
                        // If we've already hit the previously known best number of collisions,
                        // there's no point in continuing as worst case we'd just use that.
                        return false;
                    }
                }
                else
                {
                    seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
                }
 
                return true;
            }
        }
 
        private readonly struct Bucket
        {
            public readonly int StartIndex;
            public readonly int EndIndex;
 
            public Bucket(int startIndex, int count)
            {
                Debug.Assert(count > 0);
 
                StartIndex = startIndex;
                EndIndex = startIndex + count - 1;
            }
        }
    }
}