// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; using System.Collections.Generic; using System.Collections.Immutable; using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.Collections; using Microsoft.CodeAnalysis.Shared.Collections; namespace Roslyn.Utilities; /// <summary> /// NOTE: Only use if you truly need a BK-tree. If you just want to compare words, use the 'SpellChecker' type /// instead. /// <para/> /// An implementation of a Burkhard-Keller tree. Introduced in: /// <para/> /// 'Some approaches to best-match file searching.' Communications of the ACM CACM Volume 16 Issue 4, April 1973 /// Pages 230-236 http://dl.acm.org/citation.cfm?doid=362003.362025. /// </summary> internal readonly partial struct BKTree { public static readonly BKTree Empty = new([], [], []); /// <summary> /// We have three completely flat arrays of structs. These arrays fully represent the BK tree. The structure /// is as follows: /// <para/> /// The root node is in _nodes[0]. /// <para/> /// It lists the count of edges it has. These edges are in _edges in the range [0*, childCount). Each edge has /// the index of the child node it points to, and the edit distance between the parent and the child. /// <para/> /// * of course '0' is only for the root case. /// <para/> /// All nodes state where in _edges their child edges range starts, so the children for any node are in the /// range[node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount). /// <para/> /// Each node also has an associated string. These strings are concatenated and stored in /// _concatenatedLowerCaseWords. Each node has a TextSpan that indicates which portion of the character array /// is their string. Note: i'd like to use an immutable array for the characters as well. However, we need to /// create slices, and they need to work on top of an ArraySlice (which needs a char[]). The edit distance code /// also wants to work on top of raw char[]s (both for speed, and so it can pool arrays to prevent lots of /// garbage). Because of that we just keep this as a char[]. /// </summary> private readonly char[] _concatenatedLowerCaseWords; private readonly ImmutableArray<Node> _nodes; private readonly ImmutableArray<Edge> _edges; private BKTree(char[] concatenatedLowerCaseWords, ImmutableArray<Node> nodes, ImmutableArray<Edge> edges) { Contract.ThrowIfNull(concatenatedLowerCaseWords, nameof(_concatenatedLowerCaseWords)); Contract.ThrowIfTrue(nodes.IsDefault, $"{nameof(nodes)}.{nameof(nodes.IsDefault)}"); Contract.ThrowIfTrue(edges.IsDefault, $"{nameof(edges)}.{nameof(edges.IsDefault)}"); _concatenatedLowerCaseWords = concatenatedLowerCaseWords; _nodes = nodes; _edges = edges; } public static BKTree Create(IEnumerable<string> values) => new Builder(values).Create(); public void Find(ref TemporaryArray<string> result, string value, int? threshold = null) { Contract.ThrowIfNull(value, nameof(value)); Contract.ThrowIfNull(_concatenatedLowerCaseWords, nameof(_concatenatedLowerCaseWords)); Contract.ThrowIfTrue(_nodes.IsDefault, $"{nameof(_nodes)}.{nameof(_nodes.IsDefault)}"); Contract.ThrowIfTrue(_edges.IsDefault, $"{nameof(_edges)}.{nameof(_edges.IsDefault)}"); if (_nodes.Length == 0) return; Span<char> lowerCaseCharacters = value.Length < 512 ? stackalloc char[value.Length] : new char[value.Length]; for (var i = 0; i < value.Length; i++) lowerCaseCharacters[i] = CaseInsensitiveComparison.ToLower(value[i]); threshold ??= WordSimilarityChecker.GetThreshold(value); Lookup(_nodes[0], lowerCaseCharacters, threshold.Value, ref result, recursionCount: 0); } private void Lookup( Node currentNode, Span<char> queryCharacters, int threshold, ref TemporaryArray<string> result, int recursionCount) { // Don't bother recursing too deeply in the case of pathological trees. // This really only happens when the actual code is strange (like // 10,000 symbols all a single letter long). In htat case, searching // down this path will be fairly fruitless anyways. // // Note: this won't affect good searches against good data even if this // pathological chain exists. That's because the good items will still // cluster near the root node in the tree, and won't be off the end of // this long chain. if (recursionCount > 256) { return; } // We may need to compute the real edit distance (ignoring any thresholds) in the case // where edges exist as we need that edit distance to appropriately determine which edges to walk // in the tree. var characterSpan = currentNode.WordSpan; // The GetEditDistance call below serves two purposes: // 1) To determine whether currentNode should be added to the result // 2) To determine whether children need to be searched // // If there are no edges, we don't need the information to determine case 2. So, as a // performance optimization, in that case we send in a threshold to GetEditDistance // that indicates only the work necessary to determine case 1 need be performed. var edgesExist = currentNode.EdgeCount > 0; var editDistance = EditDistance.GetEditDistance( _concatenatedLowerCaseWords.AsSpan(characterSpan.Start, characterSpan.Length), queryCharacters, edgesExist ? int.MaxValue : threshold); // Case 1 if (editDistance <= threshold) { // Found a match. result.Add(new string(_concatenatedLowerCaseWords, characterSpan.Start, characterSpan.Length)); } // Case 2 if (edgesExist) { var min = editDistance - threshold; var max = editDistance + threshold; var startInclusive = currentNode.FirstEdgeIndex; var endExclusive = startInclusive + currentNode.EdgeCount; for (var i = startInclusive; i < endExclusive; i++) { var childEditDistance = _edges[i].EditDistance; if (min <= childEditDistance && childEditDistance <= max) { Lookup(_nodes[_edges[i].ChildNodeIndex], queryCharacters, threshold, ref result, recursionCount + 1); } } } } #if false // Used for diagnostic purposes. internal void DumpStats() { var sb = new StringBuilder(); sb.AppendLine("Nodes length: " + _nodes.Length); var childCountHistogram = new Dictionary<int, int>(); foreach (var node in _nodes) { var childCount = node.EdgeCount; int existing; childCountHistogram.TryGetValue(childCount, out existing); childCountHistogram[childCount] = existing + 1; } sb.AppendLine(); sb.AppendLine("Child counts:"); foreach (var kvp in childCountHistogram.OrderBy(kvp => kvp.Key)) { sb.AppendLine(kvp.Key + "\t" + kvp.Value); } // An item is dense if, starting from 1, at least 80% of it's array would be full. var densities = new int[11]; var empyCount = 0; foreach (var node in _nodes) { if (node.EdgeCount == 0) { empyCount++; continue; } var maxEditDistance = -1; var startInclusive = node.FirstEdgeIndex; var endExclusive = startInclusive + node.EdgeCount; for (var i = startInclusive; i < endExclusive; i++) { maxEditDistance = Max(maxEditDistance, _edges[i].EditDistance); } var editDistanceCount = node.EdgeCount; var bucket = 10 * editDistanceCount / maxEditDistance; densities[bucket]++; } var nonEmptyCount = _nodes.Length - empyCount; sb.AppendLine(); sb.AppendLine("NoChildren: " + empyCount); sb.AppendLine("AnyChildren: " + nonEmptyCount); sb.AppendLine("Densities:"); for (var i = 0; i < densities.Length; i++) { sb.AppendLine("<=" + i + "0% = " + densities[i] + ", " + ((float)densities[i] / nonEmptyCount)); } var result = sb.ToString(); } #endif } |