File: Shared\Utilities\BloomFilter.cs
Web Access
Project: src\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
 
namespace Microsoft.CodeAnalysis.Shared.Utilities;
 
internal sealed partial class BloomFilter
{
    // From MurmurHash:
    // 'm' and 'r' are mixing constants generated off-line.
    // The values for m and r are chosen through experimentation and 
    // supported by evidence that they work well.
    private const uint Compute_Hash_m = 0x5bd1e995;
    private const int Compute_Hash_r = 24;
 
    private readonly BitArray _bitArray;
    private readonly int _hashFunctionCount;
    private readonly bool _isCaseSensitive;
 
    /// <summary><![CDATA[
    /// 1) n  = Number of items in the filter
    /// 
    /// 2) p = Probability of false positives, (a double between 0 and 1).
    /// 
    /// 3) m = Number of bits in the filter
    /// 
    /// 4) k = Number of hash functions
    /// 
    /// m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0)))))
    /// 
    /// k = round(log(2.0) * m / n)
    /// ]]></summary>
    private BloomFilter(int expectedCount, double falsePositiveProbability, bool isCaseSensitive)
    {
        var m = Math.Max(1, ComputeM(expectedCount, falsePositiveProbability));
        var k = Math.Max(1, ComputeK(expectedCount, falsePositiveProbability));
 
        // We must have size in even bytes, so that when we deserialize from bytes we get a bit array with the same count.
        // The count is used by the hash functions.
        var sizeInEvenBytes = (m + 7) & ~7;
 
        _bitArray = new BitArray(length: sizeInEvenBytes);
        _hashFunctionCount = k;
        _isCaseSensitive = isCaseSensitive;
    }
 
    public BloomFilter(double falsePositiveProbability, bool isCaseSensitive, HashSet<string> values)
        : this(values.Count, falsePositiveProbability, isCaseSensitive)
    {
        AddRange(values);
    }
 
    public BloomFilter(
        double falsePositiveProbability,
        HashSet<string> stringValues,
        HashSet<long> longValues)
        : this(stringValues.Count + longValues.Count, falsePositiveProbability, isCaseSensitive: false)
    {
        AddRange(stringValues);
        AddRange(longValues);
    }
 
    private BloomFilter(BitArray bitArray, int hashFunctionCount, bool isCaseSensitive)
    {
        _bitArray = bitArray ?? throw new ArgumentNullException(nameof(bitArray));
        _hashFunctionCount = hashFunctionCount;
        _isCaseSensitive = isCaseSensitive;
    }
 
    // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0)))))
    private static int ComputeM(int expectedCount, double falsePositiveProbability)
    {
        var p = falsePositiveProbability;
        double n = expectedCount;
 
        var numerator = n * Math.Log(p);
        var denominator = Math.Log(1.0 / Math.Pow(2.0, Math.Log(2.0)));
        return unchecked((int)Math.Ceiling(numerator / denominator));
    }
 
    // k = round(log(2.0) * m / n)
    private static int ComputeK(int expectedCount, double falsePositiveProbability)
    {
        double n = expectedCount;
        double m = ComputeM(expectedCount, falsePositiveProbability);
 
        var temp = Math.Log(2.0) * m / n;
        return unchecked((int)Math.Round(temp));
    }
 
    /// <summary>
    /// Modification of the murmurhash2 algorithm.  Code is simpler because it operates over
    /// strings instead of byte arrays.  Because each string character is two bytes, it is known
    /// that the input will be an even number of bytes (though not necessarily a multiple of 4).
    /// 
    /// This is needed over the normal 'string.GetHashCode()' because we need to be able to generate
    /// 'k' different well distributed hashes for any given string s.  Also, we want to be able to
    /// generate these hashes without allocating any memory.  My ideal solution would be to use an
    /// MD5 hash.  However, there appears to be no way to do MD5 in .NET where you can:
    /// 
    /// a) feed it individual values instead of a byte[]
    /// 
    /// b) have the hash computed into a byte[] you provide instead of a newly allocated one
    /// 
    /// Generating 'k' pieces of garbage on each insert and lookup seems very wasteful.  So,
    /// instead, we use murmur hash since it provides well distributed values, allows for a
    /// seed, and allocates no memory.
    /// 
    /// Murmur hash is public domain.  Actual code is included below as reference.
    /// </summary>
    private static int ComputeHash(string key, int seed, bool isCaseSensitive)
    {
        unchecked
        {
            // Initialize the hash to a 'random' value
 
            var numberOfCharsLeft = key.Length;
            var h = (uint)(seed ^ numberOfCharsLeft);
 
            // Mix 4 bytes at a time into the hash.  NOTE: 4 bytes is two chars, so we iterate
            // through the string two chars at a time.
            var index = 0;
            while (numberOfCharsLeft >= 2)
            {
                var c1 = GetCharacter(key, index, isCaseSensitive);
                var c2 = GetCharacter(key, index + 1, isCaseSensitive);
 
                h = CombineTwoCharacters(h, c1, c2);
 
                index += 2;
                numberOfCharsLeft -= 2;
            }
 
            // Handle the last char (or 2 bytes) if they exist.  This happens if the original string had
            // odd length.
            if (numberOfCharsLeft == 1)
            {
                var c = GetCharacter(key, index, isCaseSensitive);
                h = CombineLastCharacter(h, c);
            }
 
            // Do a few final mixes of the hash to ensure the last few bytes are well-incorporated.
 
            h = FinalMix(h);
 
            return (int)h;
        }
    }
 
    private static int ComputeHash(long key, int seed)
    {
        // This is a duplicate of ComputeHash(string key, int seed).  However, because
        // we only have 64bits to encode we just unroll that function here.  See
        // Other function for documentation on what's going on here.
        unchecked
        {
            // Initialize the hash to a 'random' value
 
            var numberOfCharsLeft = 4;
            var h = (uint)(seed ^ numberOfCharsLeft);
 
            // Mix 4 bytes at a time into the hash.  NOTE: 4 bytes is two chars, so we iterate
            // through the long two chars at a time.
            var index = 0;
            while (numberOfCharsLeft >= 2)
            {
                var c1 = GetCharacter(key, index);
                var c2 = GetCharacter(key, index + 1);
 
                h = CombineTwoCharacters(h, c1, c2);
 
                index += 2;
                numberOfCharsLeft -= 2;
            }
 
            Debug.Assert(numberOfCharsLeft == 0);
 
            // Do a few final mixes of the hash to ensure the last few bytes are well-incorporated.
            h = FinalMix(h);
 
            return (int)h;
        }
    }
 
    private static uint CombineLastCharacter(uint h, uint c)
    {
        unchecked
        {
            h ^= c;
            h *= Compute_Hash_m;
            return h;
        }
    }
 
    private static uint FinalMix(uint h)
    {
        unchecked
        {
            h ^= h >> 13;
            h *= Compute_Hash_m;
            h ^= h >> 15;
            return h;
        }
    }
 
    private static uint CombineTwoCharacters(uint h, uint c1, uint c2)
    {
        unchecked
        {
            var k = c1 | (c2 << 16);
 
            k *= Compute_Hash_m;
            k ^= k >> Compute_Hash_r;
            k *= Compute_Hash_m;
 
            h *= Compute_Hash_m;
            h ^= k;
 
            return h;
        }
    }
 
    private static char GetCharacter(string key, int index, bool isCaseSensitive)
    {
        var c = key[index];
        return isCaseSensitive ? c : char.ToLowerInvariant(c);
    }
 
    private static char GetCharacter(long key, int index)
    {
        Debug.Assert(index <= 3);
        return (char)(key >> (16 * index));
    }
 
#if false
    //-----------------------------------------------------------------------------
    // MurmurHash2, by Austin Appleby
    //
    // Note - This code makes a few assumptions about how your machine behaves -
    // 1. We can read a 4-byte value from any address without crashing
    // 2. sizeof(int) == 4
    //
    // And it has a few limitations -
    // 1. It will not work incrementally.
    // 2. It will not produce the same results on little-endian and big-endian
    //    machines.
    unsigned int MurmurHash2(const void* key, int len, unsigned int seed)
    {
        // 'm' and 'r' are mixing constants generated offline.
        // The values for m and r are chosen through experimentation and 
        // supported by evidence that they work well.
        
        const unsigned int m = 0x5bd1e995;
        const int r = 24;
 
        // Initialize the hash to a 'random' value
        unsigned int h = seed ^ len;
 
        // Mix 4 bytes at a time into the hash
        const unsigned char* data = (const unsigned char*)key;
 
        while(len >= 4)
        {
            unsigned int k = *(unsigned int*)data;
 
            k *= m; 
            k ^= k >> r; 
            k *= m; 
 
            h *= m; 
            h ^= k;
 
            data += 4;
            len -= 4;
        }
 
        // Handle the last few bytes of the input array
        switch(len)
        {
            case 3: h ^= data[2] << 16;
            case 2: h ^= data[1] << 8;
            case 1: h ^= data[0];
                    h *= m;
        };
 
        // Do a few final mixes of the hash to ensure the last few
        // bytes are well-incorporated.
 
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
 
        return h;
    } 
#endif
 
    private void AddRange(HashSet<string> values)
    {
        foreach (var v in values)
        {
            Add(v);
        }
    }
 
    private void AddRange(HashSet<long> values)
    {
        foreach (var v in values)
        {
            Add(v);
        }
    }
 
    public void Add(string value)
    {
        for (var i = 0; i < _hashFunctionCount; i++)
        {
            var hash = ComputeHash(value, i, _isCaseSensitive);
            _bitArray[GetBitArrayIndexFromHash(hash)] = true;
        }
    }
 
    private int GetBitArrayIndexFromHash(int hash)
    {
        hash %= _bitArray.Length;
        return Math.Abs(hash);
    }
 
    public void Add(long value)
    {
        for (var i = 0; i < _hashFunctionCount; i++)
        {
            var hash = ComputeHash(value, i);
            _bitArray[GetBitArrayIndexFromHash(hash)] = true;
        }
    }
 
    public bool ProbablyContains(string value)
    {
        // Request an array of immutable hashes for this input. Note that it's possible
        // that the returned array might return a cached entry calculated by a different
        // bloom filter and thus might have more entries than we need, but it's ok as
        // it's guaranteed that the first _hashFunctionCount of those values are the values
        // we would have computed had we not used the cache.
        var hashes = BloomFilterHash.GetOrCreateHashArray(value, _isCaseSensitive, _hashFunctionCount);
 
        for (var i = 0; i < _hashFunctionCount; i++)
        {
            var hash = hashes[i];
            if (!_bitArray[GetBitArrayIndexFromHash(hash)])
            {
                return false;
            }
        }
 
        return true;
    }
 
    public bool ProbablyContains(long value)
    {
        for (var i = 0; i < _hashFunctionCount; i++)
        {
            var hash = ComputeHash(value, i);
            if (!_bitArray[GetBitArrayIndexFromHash(hash)])
            {
                return false;
            }
        }
 
        return true;
    }
 
    public bool IsEquivalent(BloomFilter filter)
    {
        return IsEquivalent(_bitArray, filter._bitArray)
            && _hashFunctionCount == filter._hashFunctionCount
            && _isCaseSensitive == filter._isCaseSensitive;
    }
 
    private static bool IsEquivalent(BitArray array1, BitArray array2)
    {
        if (array1.Length != array2.Length)
        {
            return false;
        }
 
        for (var i = 0; i < array1.Length; i++)
        {
            if (array1[i] != array2[i])
            {
                return false;
            }
        }
 
        return true;
    }
 
    /// <summary>
    /// Provides mechanism to efficiently obtain bloom filter hash for a value. Backed by a single element cache.
    /// </summary>
    internal sealed class BloomFilterHash
    {
        private static BloomFilterHash? s_cachedHash;
 
        private readonly string _value;
        private readonly bool _isCaseSensitive;
        private readonly ImmutableArray<int> _hashes;
 
        private BloomFilterHash(string value, bool isCaseSensitive, int hashFunctionCount)
        {
            _value = value;
            _isCaseSensitive = isCaseSensitive;
 
            var hashBuilder = new FixedSizeArrayBuilder<int>(hashFunctionCount);
 
            for (var i = 0; i < hashFunctionCount; i++)
                hashBuilder.Add(BloomFilter.ComputeHash(value, i, _isCaseSensitive));
 
            _hashes = hashBuilder.MoveToImmutable();
        }
 
        /// <summary>
        /// Although calculating this hash isn't terribly expensive, it does involve multiple
        /// (usually around 13) hashings of the string (the actual count is <see cref="BloomFilter._hashFunctionCount"/>).
        /// The typical usage pattern of bloom filters is that some operation (eg: find references)
        /// requires asking a multitude of bloom filters whether a particular value is likely contained.
        /// The vast majority of those bloom filters will end up hashing that string to the same values, so
        /// we put those values into a simple cache and see if it can be used before calculating.
        /// Local testing has put the hit rate of this at around 99%.
        ///
        /// Note that it's possible for this method to return an array from the cache longer than hashFunctionCount,
        /// but if so, it's guaranteed that the values returned in the first hashFunctionCount entries are
        /// the same as if the cache hadn't been used.
        /// </summary>
        public static ImmutableArray<int> GetOrCreateHashArray(string value, bool isCaseSensitive, int hashFunctionCount)
        {
            var cachedHash = s_cachedHash;
 
            // Not an equivalency check on the hashFunctionCount as a longer array is ok. This is because the
            // values in the array are determined by value and isCaseSensitive and hashFunctionCount is simply
            // used to determine the length of the returned array. As long as the cached entry matches the value
            // and isCaseSensitive and is at least as long as we need, then we can use it.
            if (cachedHash == null
                || cachedHash._isCaseSensitive != isCaseSensitive
                || cachedHash._hashes.Length < hashFunctionCount
                || cachedHash._value != value)
            {
                cachedHash = new BloomFilterHash(value, isCaseSensitive, hashFunctionCount);
                s_cachedHash = cachedHash;
            }
 
            return cachedHash._hashes;
        }
 
        // Used only by tests
        internal static bool TryGetCachedEntry(out bool isCaseSensitive, out string value)
        {
            var cachedHash = s_cachedHash;
 
            if (cachedHash == null)
            {
                isCaseSensitive = false;
                value = string.Empty;
 
                return false;
            }
 
            isCaseSensitive = cachedHash._isCaseSensitive;
            value = cachedHash._value;
 
            return true;
        }
 
        internal static void ResetCachedEntry()
        {
            s_cachedHash = null;
        }
    }
}