SparseNgramGenerator.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.CodeAnalysis.Collections;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Utilities;

namespace Microsoft.CodeAnalysis.FindSymbols;

/// <summary>
/// Generates sparse variable-length n-grams from a string, using the hash-based monotonic-stack
/// algorithm from GitHub's Blackbird code search engine.
/// <para/>
/// Ported from the C++ reference implementation at https://github.com/danlark1/sparse_ngrams
/// (Boost License 1.0). See also: https://github.blog/2023-02-06-the-technology-behind-githubs-new-code-search/
/// <para/>
/// <b>Why not fixed-length trigrams?</b> Traditional code search indices (Russ Cox's codesearch,
/// Zoekt/Sourcegraph) use fixed trigrams. A 3-character trigram over a 38-character alphabet
/// (a-z, 0-9, underscore, "other") has only ~54K possible values, so any given trigram matches a
/// large fraction of documents. Searching for "ReadLine" produces trigrams {"Rea","ead","adL","dLi",
/// "Lin","ine"} — each individually matches many documents, and only the AND of all six narrows it
/// down. Longer n-grams are exponentially more selective, but naively generating all overlapping
/// n-grams of all lengths would be O(n²).
/// <para/>
/// <b>The sparse n-gram idea:</b> instead of extracting every possible substring, use bigram hashes
/// to deterministically select a <em>sparse subset</em> of variable-length n-grams whose boundaries
/// fall at "hash valleys" (local minima in the bigram hash sequence). The monotonic stack naturally
/// finds these valleys: when a new bigram has a higher hash than the stack top, the top is a valley
/// and the span from that valley to the current position becomes an n-gram. This produces longer
/// n-grams through regions where hashes are monotonically increasing, and shorter ones at valleys —
/// giving much higher selectivity than fixed trigrams, with bounded output size (at most 2n−2 for
/// indexing, n−2 for querying).
/// <para/>
/// <b>Running example:</b> consider indexing the identifier "GetValue". The bigram positions are:
/// <code>
///   pos: 0:"Ge" 1:"et" 2:"tV" 3:"Va" 4:"al" 5:"lu" 6:"ue"
/// </code>
/// Suppose the hash values create valleys at positions 1 and 5. The algorithm emits n-grams that
/// span from each valley to the next boundary: e.g. "etVa" (pos 1–4), "lue" (pos 5–7), plus
/// n-grams connecting across valleys like "GetV" (pos 0–3). The exact n-grams depend on the hash
/// function, but the key property is: they are <em>deterministic</em> for a given string, so the
/// same n-grams are produced at index time and query time.
/// <para/>
/// At query time, <see cref="BuildCoveringNgrams"/> extracts a <em>covering</em> subset — the
/// fewest, longest n-grams that together span the entire query string. If the query is "GetValue",
/// and the covering set is {"GetVal", "alue"}, then a document can only match if its Bloom filter
/// contains both. Because these are 6 and 4 characters long (not just 3), they have far fewer false
/// positives than any individual trigram would.
/// </summary>
internal static class SparseNgramGenerator
{
    /// <summary>
    /// The shortest n-gram the algorithm can produce is 3 characters (a trigram). This happens
    /// when two adjacent bigram positions form a valley — the span from one to the next covers
    /// exactly 3 characters. Strings shorter than this cannot produce any n-grams.
    /// </summary>
    public const int MinNgramLength = 3;

    // Hash constants from the reference implementation (https://github.com/danlark1/sparse_ngrams).
    // These are mixing constants chosen to spread bigram values across the uint range, ensuring
    // that n-gram boundaries are well-distributed and don't cluster on common character pairs.
    private const ulong Mul1 = 0xc6a4a7935bd1e995UL;
    private const ulong Mul2 = 0x228876a7198b743UL;

    /// <summary>
    /// Hashes the bigram at <paramref name="pos"/> (characters at pos and pos+1) into a uint.
    /// The hash determines where n-gram boundaries fall: valleys in the hash sequence become
    /// n-gram start/end points. A good hash function here is critical — it must spread common
    /// bigrams widely so that boundaries don't cluster on frequent character pairs like "th" or "er".
    /// </summary>
    private static uint HashBigram(ReadOnlySpan<char> text, int pos)
    {
        var a = text[pos] * Mul1 + text[pos + 1] * Mul2;
        return (uint)(a + (~a >> 47));
    }

    /// <summary>
    /// Produces the full set of sparse n-grams for indexing (at most 2n−2). These are stored in the
    /// document's Bloom filter at index time so that queries can later check for their presence.
    /// <para/>
    /// The algorithm maintains a stack of bigram positions ordered by increasing hash value (a
    /// "monotonic stack"). As we scan left to right, each new bigram either continues the ascending
    /// run (pushed onto the stack) or breaks it (pops entries with lower hashes). Each pop reveals
    /// a hash valley — a position where the hash was locally minimal — and emits an n-gram spanning
    /// from that valley to the current position. This is why the algorithm produces <em>variable-
    /// length</em> n-grams: a long ascending hash run produces a single long n-gram, while a
    /// sequence of valleys produces many short ones.
    /// <para/>
    /// Two kinds of n-grams are emitted per iteration:
    /// <list type="number">
    /// <item><b>Popped n-grams:</b> when the current hash exceeds the stack top, the top is popped
    /// and an n-gram from that position to the current bigram's end is emitted. This captures the
    /// span "guarded" by the valley.</item>
    /// <item><b>Extension n-gram:</b> after all pops, if the stack is non-empty, an n-gram from
    /// the new top to the current bigram's end is emitted. This extends the previous valley's n-gram
    /// to cover the current position, ensuring overlapping coverage of the entire string.</item>
    /// </list>
    /// Together, these two rules guarantee that every position in the string is covered by at least
    /// one indexed n-gram — the property that makes query-time checking sound.
    /// </summary>
    public static void BuildAllNgrams(ReadOnlySpan<char> text, ref TemporaryArray<(int start, int length)> results)
    {
        if (text.Length < MinNgramLength)
            return;

        using var _ = ArrayBuilder<(uint hash, int pos)>.GetInstance(out var stack);

        for (var i = 0; i + 2 <= text.Length; i++)
        {
            var hash = HashBigram(text, i);

            // Pop all stack entries with hashes ≤ the current hash. Each popped entry represents
            // a hash valley whose n-gram now terminates at the current bigram's end (position i+2).
            while (stack.Count > 0 && hash > stack[^1].hash)
            {
                var ngramStart = stack[^1].pos;
                var ngramLength = i + 2 - ngramStart;
                results.Add((ngramStart, ngramLength));

                // Collapse consecutive entries with the same hash value. When multiple bigrams hash
                // identically, they form a plateau — only the leftmost one matters as an n-gram
                // boundary, since the others would produce duplicate substrings of the same span.
                while (stack.Count > 1 && stack[^1].hash == stack[^2].hash)
                    stack.Pop();

                stack.Pop();
            }

            // Extend the current stack top's n-gram to include the current position. Without this,
            // the region between the last valley and the current position would not be covered by
            // any indexed n-gram, potentially causing false negatives at query time.
            if (stack.Count > 0)
            {
                var ngramStart = stack[^1].pos;
                var ngramLength = i + 2 - ngramStart;
                results.Add((ngramStart, ngramLength));
            }

            stack.Add((hash, i));
        }
    }

    /// <summary>
    /// Produces a minimal covering set of sparse n-grams for querying (at most n−2). At query time
    /// we need to check the document index for the presence of the query's n-grams. Checking fewer,
    /// longer n-grams is strictly better: each is more selective (fewer false positives) and we do
    /// fewer Bloom filter lookups.
    /// <para/>
    /// This method finds the <em>covering set</em>: the smallest number of n-grams from
    /// <see cref="BuildAllNgrams"/>'s output that together span the entire query string, preferring
    /// the longest possible n-grams. The soundness guarantee is: if a document contains a substring
    /// matching the query, then every covering n-gram of that query was emitted by
    /// <see cref="BuildAllNgrams"/> when that document was indexed, and therefore is in the
    /// document's Bloom filter.
    /// <para/>
    /// The algorithm uses a deque (double-ended queue) as a monotonic structure. The deque tracks
    /// hash valleys from front (oldest/leftmost) to back (newest/rightmost). N-grams are emitted
    /// in two situations:
    /// <list type="number">
    /// <item><b>Length cap:</b> when the span from the front of the deque to the current position
    /// would exceed <paramref name="maxNgramLength"/>, the front entry is evicted and an n-gram
    /// from the front to the second entry is emitted. This prevents impractically long n-grams that
    /// would rarely match and waste Bloom filter bits.</item>
    /// <item><b>Valley collapse:</b> when the current bigram hash exceeds the back of the deque
    /// (breaking the monotonic-decreasing invariant), entries are popped and n-grams emitted,
    /// covering the region from each popped valley to the current position.</item>
    /// </list>
    /// </summary>
    public static void BuildCoveringNgrams(
        ReadOnlySpan<char> text,
        ref TemporaryArray<(int start, int length)> results,
        int maxNgramLength = 16)
    {
        if (text.Length < MinNgramLength)
            return;

        using var _ = Deque<(uint hash, int pos)>.GetInstance(out var deque);

        for (var i = 0; i + 2 <= text.Length; i++)
        {
            var hash = HashBigram(text, i);

            // Evict the front if the span from the front valley to the current bigram's end would
            // exceed the maximum n-gram length. Capping length keeps n-grams practical — an
            // extremely long n-gram would be so specific it's unlikely to appear in the Bloom
            // filter, offering no benefit over a moderately long one.
            if (deque.Count > 1 &&
                i - deque.First.pos + 3 >= maxNgramLength)
            {
                var frontPos = deque.First.pos;
                var secondPos = deque[1].pos;
                results.Add((frontPos, secondPos + 2 - frontPos));
                deque.RemoveFirst();
            }

            // Pop entries from the back whose hashes are ≤ the current hash, maintaining the
            // monotonic invariant. When the front and back have the same hash (the entire deque
            // is a single plateau), the plateau collapses: we emit n-grams covering the full
            // span, then unwind the deque to emit bridging n-grams between consecutive entries.
            while (deque.Count > 0 && hash > deque.Last.hash)
            {
                if (deque.First.hash == deque.Last.hash)
                {
                    var lastPos = deque.Last.pos;
                    results.Add((lastPos, i + 2 - lastPos));

                    // Unwind the entire deque, emitting bridging n-grams that connect each
                    // consecutive pair. This ensures the covering set has no gaps.
                    while (deque.Count > 1)
                    {
                        var lastPosition = deque.Last.pos + 2;
                        deque.RemoveLast();
                        results.Add((deque.Last.pos, lastPosition - deque.Last.pos));
                    }
                }

                deque.RemoveLast();
            }

            deque.AddLast((hash, i));
        }

        // Drain any remaining entries after the last bigram. The deque still holds valleys that
        // were never popped (because no subsequent bigram had a higher hash). Unwinding these
        // emits the final covering n-grams that span to the end of the string.
        while (deque.Count > 1)
        {
            var lastPosition = deque.Last.pos + 2;
            deque.RemoveLast();
            results.Add((deque.Last.pos, lastPosition - deque.Last.pos));
        }
    }

    /// <summary>
    /// Checks whether all covering n-grams of <paramref name="text"/> are probably present in
    /// <paramref name="filter"/>. This is the query-time entry point: if <em>any</em> covering
    /// n-gram is missing from the filter, the document definitely does not contain the query
    /// substring, and we can skip the expensive full match. Returns <see langword="true"/> for
    /// strings shorter than <see cref="MinNgramLength"/> because no n-grams can be extracted
    /// and we must fall back to the full match.
    /// </summary>
    public static bool CoveringNgramsProbablyContained(
        ReadOnlySpan<char> text, Shared.Utilities.BloomFilter filter)
    {
        if (text.Length < MinNgramLength)
            return true;

        using var ngrams = TemporaryArray<(int start, int length)>.Empty;
        BuildCoveringNgrams(text, ref ngrams.AsRef());

        foreach (var (start, length) in ngrams)
        {
            if (!filter.ProbablyContains(text.Slice(start, length)))
                return false;
        }

        return true;
    }
}
File: FindSymbols\TopLevelSyntaxTree\SparseNgramGenerator.cs	Web Access
Project: src\src\roslyn\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)