File: FindSymbols\SparseNgramTests.cs
Web Access
Project: src\src\Workspaces\CoreTest\Microsoft.CodeAnalysis.Workspaces.UnitTests.csproj (Microsoft.CodeAnalysis.Workspaces.UnitTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.CodeAnalysis.Collections;
using Microsoft.CodeAnalysis.FindSymbols;
using Microsoft.CodeAnalysis.Shared.Utilities;
using Xunit;
 
namespace Microsoft.CodeAnalysis.UnitTests.FindSymbols;
 
public sealed class SparseNgramTests
{
    private static HashSet<string> GetAllNgrams(string text)
    {
        using var results = TemporaryArray<(int start, int length)>.Empty;
        SparseNgramGenerator.BuildAllNgrams(text, ref results.AsRef());
 
        var set = new HashSet<string>();
        foreach (var (start, length) in results)
            set.Add(text.Substring(start, length));
 
        return set;
    }
 
    private static HashSet<string> GetCoveringNgrams(string text)
    {
        using var results = TemporaryArray<(int start, int length)>.Empty;
        SparseNgramGenerator.BuildCoveringNgrams(text, ref results.AsRef());
 
        var set = new HashSet<string>();
        foreach (var (start, length) in results)
            set.Add(text.Substring(start, length));
 
        return set;
    }
 
    #region BuildAllNgrams — matches reference implementation
 
    [Fact]
    public void BuildAll_TwoChars_NoNgrams()
    {
        var ngrams = GetAllNgrams("he");
        Assert.Empty(ngrams);
    }
 
    [Fact]
    public void BuildAll_ThreeChars_SingleTrigram()
    {
        var ngrams = GetAllNgrams("hel");
        Assert.Single(ngrams);
        Assert.Contains("hel", ngrams);
    }
 
    [Fact]
    public void BuildAll_FourChars()
    {
        // Reference: BuildAllNgrams("hell") -> {"hel", "ell"}
        var ngrams = GetAllNgrams("hell");
        Assert.Contains("hel", ngrams);
        Assert.Contains("ell", ngrams);
    }
 
    [Fact]
    public void BuildAll_HelloWorld_MatchesReference()
    {
        // Reference: BuildAllNgrams("hello world") produces:
        //   {"hel", "ell", "llo", "lo ", "o w", "lo w", " wo", "lo wo", "wor", "orl", "worl", "rld"}
        var ngrams = GetAllNgrams("hello world");
        Assert.Contains("hel", ngrams);
        Assert.Contains("ell", ngrams);
        Assert.Contains("llo", ngrams);
        Assert.Contains("lo ", ngrams);
        Assert.Contains("o w", ngrams);
        Assert.Contains("lo w", ngrams);
        Assert.Contains(" wo", ngrams);
        Assert.Contains("lo wo", ngrams);
        Assert.Contains("wor", ngrams);
        Assert.Contains("orl", ngrams);
        Assert.Contains("worl", ngrams);
        Assert.Contains("rld", ngrams);
    }
 
    [Fact]
    public void BuildAll_Chester_MatchesReference()
    {
        // Reference: BuildAllNgrams("chester ") produces:
        //   {"che", "hes", "ches", "est", "chest", "ste", "ter", "ster", "er "}
        var ngrams = GetAllNgrams("chester ");
        Assert.Contains("che", ngrams);
        Assert.Contains("hes", ngrams);
        Assert.Contains("ches", ngrams);
        Assert.Contains("est", ngrams);
        Assert.Contains("chest", ngrams);
        Assert.Contains("ste", ngrams);
        Assert.Contains("ter", ngrams);
        Assert.Contains("ster", ngrams);
        Assert.Contains("er ", ngrams);
    }
 
    [Fact]
    public void BuildAll_ForLoop_MatchesReference()
    {
        // Reference: BuildAllNgrams("for(int i=42") produces:
        //   {"for", "or(", "for(", "r(i", "for(i", "(in", "int", "(int", "nt ",
        //    "t i", " i=", "t i=", "i=4", "t i=4", "nt i=4", "(int i=4", "=42"}
        var ngrams = GetAllNgrams("for(int i=42");
        Assert.Contains("for", ngrams);
        Assert.Contains("or(", ngrams);
        Assert.Contains("for(", ngrams);
        Assert.Contains("r(i", ngrams);
        Assert.Contains("for(i", ngrams);
        Assert.Contains("(in", ngrams);
        Assert.Contains("int", ngrams);
        Assert.Contains("(int", ngrams);
        Assert.Contains("nt ", ngrams);
        Assert.Contains("t i", ngrams);
        Assert.Contains(" i=", ngrams);
        Assert.Contains("t i=", ngrams);
        Assert.Contains("i=4", ngrams);
        Assert.Contains("t i=4", ngrams);
        Assert.Contains("nt i=4", ngrams);
        Assert.Contains("(int i=4", ngrams);
        Assert.Contains("=42", ngrams);
    }
 
    [Fact]
    public void BuildAll_BoundedSize()
    {
        // The algorithm produces at most 2n-2 n-grams for a string of length n.
        var text = "abcdefghijklmnop";
        using var results = TemporaryArray<(int start, int length)>.Empty;
        SparseNgramGenerator.BuildAllNgrams(text, ref results.AsRef());
        Assert.True(results.Count <= 2 * text.Length - 2,
            $"Expected at most {2 * text.Length - 2} n-grams, got {results.Count}");
    }
 
    [Fact]
    public void BuildAll_AllNgramsAtLeastThreeChars()
    {
        var ngrams = GetAllNgrams("abcdefghij");
        Assert.DoesNotContain(ngrams, g => g.Length < SparseNgramGenerator.MinNgramLength);
    }
 
    #endregion
 
    #region BuildAllNgrams — negative
 
    [Fact]
    public void BuildAll_EmptyString_NoNgrams()
    {
        Assert.Empty(GetAllNgrams(""));
    }
 
    [Fact]
    public void BuildAll_OneChar_NoNgrams()
    {
        Assert.Empty(GetAllNgrams("a"));
    }
 
    [Fact]
    public void BuildAll_TwoChars_NoNgrams2()
    {
        Assert.Empty(GetAllNgrams("ab"));
    }
 
    #endregion
 
    #region BuildCoveringNgrams — matches reference implementation
 
    [Fact]
    public void BuildCovering_TwoChars_NoNgrams()
    {
        Assert.Empty(GetCoveringNgrams("he"));
    }
 
    [Fact]
    public void BuildCovering_ThreeChars_SingleTrigram()
    {
        var ngrams = GetCoveringNgrams("hel");
        Assert.Single(ngrams);
        Assert.Contains("hel", ngrams);
    }
 
    [Fact]
    public void BuildCovering_FourChars()
    {
        // Reference: BuildCoveringNgrams("hell") -> {"hel", "ell"}
        var ngrams = GetCoveringNgrams("hell");
        Assert.Contains("hel", ngrams);
        Assert.Contains("ell", ngrams);
    }
 
    [Fact]
    public void BuildCovering_HelloWorld_MatchesReference()
    {
        // Reference: BuildCoveringNgrams("hello world") produces:
        //   {"hel", "ell", "llo", "rld", "worl", "lo wo"}
        var ngrams = GetCoveringNgrams("hello world");
        Assert.Contains("hel", ngrams);
        Assert.Contains("ell", ngrams);
        Assert.Contains("llo", ngrams);
        Assert.Contains("rld", ngrams);
        Assert.Contains("worl", ngrams);
        Assert.Contains("lo wo", ngrams);
    }
 
    [Fact]
    public void BuildCovering_ChesterSpace_MatchesReference()
    {
        // Reference: BuildCoveringNgrams("chester ") -> {"chest", "ster", "er "}
        var ngrams = GetCoveringNgrams("chester ");
        Assert.Contains("chest", ngrams);
        Assert.Contains("ster", ngrams);
        Assert.Contains("er ", ngrams);
    }
 
    [Fact]
    public void BuildCovering_Chester_MatchesReference()
    {
        // Reference: BuildCoveringNgrams("chester") -> {"chest", "ster"}
        var ngrams = GetCoveringNgrams("chester");
        Assert.Contains("chest", ngrams);
        Assert.Contains("ster", ngrams);
    }
 
    [Fact]
    public void BuildCovering_ForLoop_MatchesReference()
    {
        // Reference: BuildCoveringNgrams("for(int i=42") -> {"for(i", "(int i=4", "=42"}
        var ngrams = GetCoveringNgrams("for(int i=42");
        Assert.Contains("for(i", ngrams);
        Assert.Contains("(int i=4", ngrams);
        Assert.Contains("=42", ngrams);
    }
 
    [Fact]
    public void BuildCovering_IsSubsetOfBuildAll()
    {
        var text = "hello world";
        var all = GetAllNgrams(text);
        var covering = GetCoveringNgrams(text);
        Assert.True(covering.IsSubsetOf(all),
            $"Covering n-grams should be a subset of all n-grams. Extra: {string.Join(", ", covering.Except(all))}");
    }
 
    [Fact]
    public void BuildCovering_BoundedSize()
    {
        // The covering set produces at most n-2 n-grams.
        var text = "abcdefghijklmnop";
        using var results = TemporaryArray<(int start, int length)>.Empty;
        SparseNgramGenerator.BuildCoveringNgrams(text, ref results.AsRef());
        Assert.True(results.Count <= text.Length - 2,
            $"Expected at most {text.Length - 2} covering n-grams, got {results.Count}");
    }
 
    #endregion
 
    #region CoveringNgramsProbablyContained
 
    [Fact]
    public void CoveringCheck_AllPresent_ReturnsTrue()
    {
        var filter = BuildFilterForText("readline");
        Assert.True(SparseNgramGenerator.CoveringNgramsProbablyContained("readline", filter));
    }
 
    [Fact]
    public void CoveringCheck_SubstringPresent_ReturnsTrue()
    {
        var filter = BuildFilterForText("readline");
        Assert.True(SparseNgramGenerator.CoveringNgramsProbablyContained("read", filter));
    }
 
    [Fact]
    public void CoveringCheck_DifferentText_ReturnsFalse()
    {
        var filter = BuildFilterForText("readline");
        Assert.False(SparseNgramGenerator.CoveringNgramsProbablyContained("xyzwvq", filter));
    }
 
    [Fact]
    public void CoveringCheck_ShortText_BelowMinLength_ReturnsTrue()
    {
        var filter = BuildFilterForText("readline");
        Assert.True(SparseNgramGenerator.CoveringNgramsProbablyContained("re", filter));
    }
 
    [Fact]
    public void CoveringCheck_PartialOverlap_ReturnsFalse()
    {
        var filter = BuildFilterForText("goobar");
        Assert.False(SparseNgramGenerator.CoveringNgramsProbablyContained("goobaz", filter));
    }
 
    [Fact]
    public void CoveringCheck_CaseSensitive_DifferentCase_ReturnsFalse()
    {
        var filter = BuildFilterForText("readline");
        Assert.False(SparseNgramGenerator.CoveringNgramsProbablyContained("READLINE", filter));
    }
 
    #endregion
 
    private static BloomFilter BuildFilterForText(string text)
    {
        using var ngrams = TemporaryArray<(int start, int length)>.Empty;
        SparseNgramGenerator.BuildAllNgrams(text, ref ngrams.AsRef());
 
        var strings = new HashSet<string>();
        foreach (var (start, length) in ngrams)
            strings.Add(text.Substring(start, length));
 
        return new BloomFilter(0.01, isCaseSensitive: true, strings);
    }
}