// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; using System.Collections.Generic; using System.Collections.Immutable; using System.Diagnostics; using System.Linq; using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.Text; namespace Roslyn.Utilities; internal readonly partial struct BKTree { private readonly struct Builder { // The number of edges we pre-allocate space for for each node in _compactEdges. // // To make the comments simpler below, i'll use '4' as a synonym for CompactEdgeAllocationSize. // '4' simply reads better and makes it clearer what's going on. private const int CompactEdgeAllocationSize = 4; // Instead of producing a char[] for each string we're building a node for, we instead // have one long char[] with all the chracters of each string concatenated. i.e. // "goo" "bar" and "baz" becomes { f, o, o, b, a, r, b, a, z }. Then in _wordSpans // we have the text spans for each of those words in this array. This gives us only // two allocations instead of as many allocations as the number of strings we have. // // Once we are done building, we pass this to the BKTree and its nodes also state the // span of this array that corresponds to the word they were created for. This works // well as other dependent facilities (like EditDistance) can work on sub-arrays without // any problems. private readonly char[] _concatenatedLowerCaseWords; private readonly TextSpan[] _wordSpans; // Note: while building a BKTree we have to store children with parents, keyed by the // edit distance between the two. Naive implementations might store a list or dictionary // of children along with each node. However, this would be very inefficient and would // put an enormous amount of memory pressure on the system. // // Emperical data for a nice large assembly like mscorlib gives us the following // information: // // Unique-Words (ignoring case): 9662 // // For each unique word we need a node in the BKTree. If we stored a list or dictionary // with each node, that would be 10s of thousands of objects created that would then // just have to be GCed. That's a lot of garbage pressure we'd like to avoid. // // Now if we look at all those nodes, we can see the following information about how many // children each has. // // Edge counts: // 0 5560 // 1 1884 // 2 887 // 3 527 // 4 322 // 5 200 // 6 114 // 7 69 // 8 47 // 9 20 // 10 8 // 11 10 // 12 7 // 13 4 // 15 1 // 16 1 // 54 1 // // // i.e. The number of nodes with edge-counts less than or equal to four is: 5560+1884+887+527+322=9180. // This is 95% of the total number of edges we are adding. Looking at many other dlls // we found that this ratio stays true across the board. i.e. with all dlls, 95% of nodes // have 4 or less edges. // // So, to optimize things, we pre-alloc a single array with space for 4 edges for each // node we're going to add. Each node then gets that much space to store edge information. // If it needs more than that space, then we have a fall-over dictionary that it can store // information in. // // Once building is complete, the GC only needs to deallocate this single array and the // spillover dictionaries. // // This approach produces 1/20th the amount of garbage while building the tree. // // Each node at index i has its edges in this array in the range [4*i, 4*i + 4); private readonly Edge[] _compactEdges; private readonly BuilderNode[] _builderNodes; public Builder(IEnumerable<string> values) { // TODO(cyrusn): Properly handle unicode normalization here. var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray(); var charCount = distinctValues.Sum(v => v.Length); _concatenatedLowerCaseWords = new char[charCount]; _wordSpans = new TextSpan[distinctValues.Length]; var characterIndex = 0; for (var i = 0; i < distinctValues.Length; i++) { var value = distinctValues[i]; _wordSpans[i] = new TextSpan(characterIndex, value.Length); foreach (var ch in value) { _concatenatedLowerCaseWords[characterIndex] = CaseInsensitiveComparison.ToLower(ch); characterIndex++; } } // We will have one node for each string value that we are adding. _builderNodes = new BuilderNode[distinctValues.Length]; _compactEdges = new Edge[distinctValues.Length * CompactEdgeAllocationSize]; } internal BKTree Create() { for (var i = 0; i < _wordSpans.Length; i++) { Add(_wordSpans[i], insertionIndex: i); } var nodes = ImmutableArray.CreateBuilder<Node>(_builderNodes.Length); // There will be one less edge in the graph than nodes. Each node (except for the // root) will have a single edge pointing to it. var edges = ImmutableArray.CreateBuilder<Edge>(Math.Max(0, _builderNodes.Length - 1)); BuildArrays(nodes, edges); return new BKTree(_concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable()); } private void BuildArrays(ImmutableArray<Node>.Builder nodes, ImmutableArray<Edge>.Builder edges) { var currentEdgeIndex = 0; for (var i = 0; i < _builderNodes.Length; i++) { var builderNode = _builderNodes[i]; var edgeCount = builderNode.EdgeCount; nodes.Add(new Node(builderNode.CharacterSpan, edgeCount, currentEdgeIndex)); if (edgeCount > 0) { // First, copy any edges that are in the compact array. var start = i * CompactEdgeAllocationSize; var end = start + Math.Min(edgeCount, CompactEdgeAllocationSize); for (var j = start; j < end; j++) { edges.Add(_compactEdges[j]); } // Then, if we've spilled over any edges, copy them as well. var spilledEdges = builderNode.SpilloverEdges; if (spilledEdges != null) { Debug.Assert(spilledEdges.Count == (edgeCount - CompactEdgeAllocationSize)); foreach (var (distance, childIndex) in spilledEdges) edges.Add(new Edge(distance, childIndex)); } } currentEdgeIndex += edgeCount; } Debug.Assert(currentEdgeIndex == edges.Capacity); Debug.Assert(currentEdgeIndex == edges.Count); } private void Add(TextSpan characterSpan, int insertionIndex) { if (insertionIndex == 0) { _builderNodes[insertionIndex] = new BuilderNode(characterSpan); return; } var currentNodeIndex = 0; while (true) { var currentNode = _builderNodes[currentNodeIndex]; // Determine the edit distance between these two words. Note: we do not use // a threshold here as we need the actual edit distance so we can actually // determine what edge to make or walk. var editDistance = EditDistance.GetEditDistance( _concatenatedLowerCaseWords.AsSpan(currentNode.CharacterSpan.Start, currentNode.CharacterSpan.Length), _concatenatedLowerCaseWords.AsSpan(characterSpan.Start, characterSpan.Length)); if (editDistance == 0) { // This should never happen. We dedupe all items before proceeding to the 'Add' step. // So the edit distance should always be non-zero. throw new InvalidOperationException(); } if (TryGetChildIndex(currentNode, currentNodeIndex, editDistance, out var childNodeIndex)) { // Edit distances collide. Move to this child and add this word to it. currentNodeIndex = childNodeIndex; continue; } // found the node we want to add the child node to. AddChildNode(characterSpan, insertionIndex, currentNode.EdgeCount, currentNodeIndex, editDistance); return; } } private void AddChildNode( TextSpan characterSpan, int insertionIndex, int currentNodeEdgeCount, int currentNodeIndex, int editDistance) { // The node as 'currentNodeIndex' doesn't have an edge with this edit distance. // Three cases to handle: // 1) there are less than 4 edges. We simply place the edge into the correct // location in compactEdges // 2) there are 4 edges. We need to make the spillover dictionary and then add // the new edge into that. // 3) there are more than 4 edges. Just put the new edge in the spillover // dictionary. if (currentNodeEdgeCount < CompactEdgeAllocationSize) { _compactEdges[currentNodeIndex * CompactEdgeAllocationSize + currentNodeEdgeCount] = new Edge(editDistance, insertionIndex); } else { ref var node = ref _builderNodes[currentNodeIndex]; // When we hit 4 elements, we need to allocate the spillover dictionary to // place the extra edges. if (currentNodeEdgeCount == CompactEdgeAllocationSize) { RoslynDebug.Assert(node.SpilloverEdges is null); var spilloverEdges = new Dictionary<int, int>(); node.SpilloverEdges = spilloverEdges; } else { RoslynDebug.AssertNotNull(node.SpilloverEdges); } node.SpilloverEdges.Add(editDistance, insertionIndex); } _builderNodes[currentNodeIndex].EdgeCount++; _builderNodes[insertionIndex] = new BuilderNode(characterSpan); return; } private bool TryGetChildIndex(BuilderNode currentNode, int currentNodeIndex, int editDistance, out int childIndex) { // linearly scan the children we have to see if there is one with this edit distance. var start = currentNodeIndex * CompactEdgeAllocationSize; var end = start + Math.Min(currentNode.EdgeCount, CompactEdgeAllocationSize); for (var i = start; i < end; i++) { if (_compactEdges[i].EditDistance == editDistance) { childIndex = _compactEdges[i].ChildNodeIndex; return true; } } // If we've spilled over any edges, check there as well if (currentNode.SpilloverEdges != null) { // Can't use the compact array. Have to use the spillover dictionary instead. Debug.Assert(currentNode.SpilloverEdges.Count == (currentNode.EdgeCount - CompactEdgeAllocationSize)); return currentNode.SpilloverEdges.TryGetValue(editDistance, out childIndex); } childIndex = -1; return false; } private struct BuilderNode(TextSpan characterSpan) { public readonly TextSpan CharacterSpan = characterSpan; public int EdgeCount; public Dictionary<int, int>? SpilloverEdges; } } } |