|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities;
internal readonly partial struct BKTree
{
private readonly struct Builder
{
// The number of edges we pre-allocate space for for each node in _compactEdges.
//
// To make the comments simpler below, i'll use '4' as a synonym for CompactEdgeAllocationSize.
// '4' simply reads better and makes it clearer what's going on.
private const int CompactEdgeAllocationSize = 4;
// Instead of producing a char[] for each string we're building a node for, we instead
// have one long char[] with all the chracters of each string concatenated. i.e.
// "goo" "bar" and "baz" becomes { f, o, o, b, a, r, b, a, z }. Then in _wordSpans
// we have the text spans for each of those words in this array. This gives us only
// two allocations instead of as many allocations as the number of strings we have.
//
// Once we are done building, we pass this to the BKTree and its nodes also state the
// span of this array that corresponds to the word they were created for. This works
// well as other dependent facilities (like EditDistance) can work on sub-arrays without
// any problems.
private readonly char[] _concatenatedLowerCaseWords;
private readonly TextSpan[] _wordSpans;
// Note: while building a BKTree we have to store children with parents, keyed by the
// edit distance between the two. Naive implementations might store a list or dictionary
// of children along with each node. However, this would be very inefficient and would
// put an enormous amount of memory pressure on the system.
//
// Emperical data for a nice large assembly like mscorlib gives us the following
// information:
//
// Unique-Words (ignoring case): 9662
//
// For each unique word we need a node in the BKTree. If we stored a list or dictionary
// with each node, that would be 10s of thousands of objects created that would then
// just have to be GCed. That's a lot of garbage pressure we'd like to avoid.
//
// Now if we look at all those nodes, we can see the following information about how many
// children each has.
//
// Edge counts:
// 0 5560
// 1 1884
// 2 887
// 3 527
// 4 322
// 5 200
// 6 114
// 7 69
// 8 47
// 9 20
// 10 8
// 11 10
// 12 7
// 13 4
// 15 1
// 16 1
// 54 1
//
//
// i.e. The number of nodes with edge-counts less than or equal to four is: 5560+1884+887+527+322=9180.
// This is 95% of the total number of edges we are adding. Looking at many other dlls
// we found that this ratio stays true across the board. i.e. with all dlls, 95% of nodes
// have 4 or less edges.
//
// So, to optimize things, we pre-alloc a single array with space for 4 edges for each
// node we're going to add. Each node then gets that much space to store edge information.
// If it needs more than that space, then we have a fall-over dictionary that it can store
// information in.
//
// Once building is complete, the GC only needs to deallocate this single array and the
// spillover dictionaries.
//
// This approach produces 1/20th the amount of garbage while building the tree.
//
// Each node at index i has its edges in this array in the range [4*i, 4*i + 4);
private readonly Edge[] _compactEdges;
private readonly BuilderNode[] _builderNodes;
public Builder(IEnumerable<string> values)
{
// TODO(cyrusn): Properly handle unicode normalization here.
var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray();
var charCount = distinctValues.Sum(v => v.Length);
_concatenatedLowerCaseWords = new char[charCount];
_wordSpans = new TextSpan[distinctValues.Length];
var characterIndex = 0;
for (var i = 0; i < distinctValues.Length; i++)
{
var value = distinctValues[i];
_wordSpans[i] = new TextSpan(characterIndex, value.Length);
foreach (var ch in value)
{
_concatenatedLowerCaseWords[characterIndex] = CaseInsensitiveComparison.ToLower(ch);
characterIndex++;
}
}
// We will have one node for each string value that we are adding.
_builderNodes = new BuilderNode[distinctValues.Length];
_compactEdges = new Edge[distinctValues.Length * CompactEdgeAllocationSize];
}
internal BKTree Create()
{
for (var i = 0; i < _wordSpans.Length; i++)
{
Add(_wordSpans[i], insertionIndex: i);
}
var nodes = ImmutableArray.CreateBuilder<Node>(_builderNodes.Length);
// There will be one less edge in the graph than nodes. Each node (except for the
// root) will have a single edge pointing to it.
var edges = ImmutableArray.CreateBuilder<Edge>(Math.Max(0, _builderNodes.Length - 1));
BuildArrays(nodes, edges);
return new BKTree(_concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable());
}
private void BuildArrays(ImmutableArray<Node>.Builder nodes, ImmutableArray<Edge>.Builder edges)
{
var currentEdgeIndex = 0;
for (var i = 0; i < _builderNodes.Length; i++)
{
var builderNode = _builderNodes[i];
var edgeCount = builderNode.EdgeCount;
nodes.Add(new Node(builderNode.CharacterSpan, edgeCount, currentEdgeIndex));
if (edgeCount > 0)
{
// First, copy any edges that are in the compact array.
var start = i * CompactEdgeAllocationSize;
var end = start + Math.Min(edgeCount, CompactEdgeAllocationSize);
for (var j = start; j < end; j++)
{
edges.Add(_compactEdges[j]);
}
// Then, if we've spilled over any edges, copy them as well.
var spilledEdges = builderNode.SpilloverEdges;
if (spilledEdges != null)
{
Debug.Assert(spilledEdges.Count == (edgeCount - CompactEdgeAllocationSize));
foreach (var (distance, childIndex) in spilledEdges)
edges.Add(new Edge(distance, childIndex));
}
}
currentEdgeIndex += edgeCount;
}
Debug.Assert(currentEdgeIndex == edges.Capacity);
Debug.Assert(currentEdgeIndex == edges.Count);
}
private void Add(TextSpan characterSpan, int insertionIndex)
{
if (insertionIndex == 0)
{
_builderNodes[insertionIndex] = new BuilderNode(characterSpan);
return;
}
var currentNodeIndex = 0;
while (true)
{
var currentNode = _builderNodes[currentNodeIndex];
// Determine the edit distance between these two words. Note: we do not use
// a threshold here as we need the actual edit distance so we can actually
// determine what edge to make or walk.
var editDistance = EditDistance.GetEditDistance(
_concatenatedLowerCaseWords.AsSpan(currentNode.CharacterSpan.Start, currentNode.CharacterSpan.Length),
_concatenatedLowerCaseWords.AsSpan(characterSpan.Start, characterSpan.Length));
if (editDistance == 0)
{
// This should never happen. We dedupe all items before proceeding to the 'Add' step.
// So the edit distance should always be non-zero.
throw new InvalidOperationException();
}
if (TryGetChildIndex(currentNode, currentNodeIndex, editDistance, out var childNodeIndex))
{
// Edit distances collide. Move to this child and add this word to it.
currentNodeIndex = childNodeIndex;
continue;
}
// found the node we want to add the child node to.
AddChildNode(characterSpan, insertionIndex, currentNode.EdgeCount, currentNodeIndex, editDistance);
return;
}
}
private void AddChildNode(
TextSpan characterSpan, int insertionIndex, int currentNodeEdgeCount, int currentNodeIndex, int editDistance)
{
// The node as 'currentNodeIndex' doesn't have an edge with this edit distance.
// Three cases to handle:
// 1) there are less than 4 edges. We simply place the edge into the correct
// location in compactEdges
// 2) there are 4 edges. We need to make the spillover dictionary and then add
// the new edge into that.
// 3) there are more than 4 edges. Just put the new edge in the spillover
// dictionary.
if (currentNodeEdgeCount < CompactEdgeAllocationSize)
{
_compactEdges[currentNodeIndex * CompactEdgeAllocationSize + currentNodeEdgeCount] =
new Edge(editDistance, insertionIndex);
}
else
{
ref var node = ref _builderNodes[currentNodeIndex];
// When we hit 4 elements, we need to allocate the spillover dictionary to
// place the extra edges.
if (currentNodeEdgeCount == CompactEdgeAllocationSize)
{
RoslynDebug.Assert(node.SpilloverEdges is null);
var spilloverEdges = new Dictionary<int, int>();
node.SpilloverEdges = spilloverEdges;
}
else
{
RoslynDebug.AssertNotNull(node.SpilloverEdges);
}
node.SpilloverEdges.Add(editDistance, insertionIndex);
}
_builderNodes[currentNodeIndex].EdgeCount++;
_builderNodes[insertionIndex] = new BuilderNode(characterSpan);
return;
}
private bool TryGetChildIndex(BuilderNode currentNode, int currentNodeIndex, int editDistance, out int childIndex)
{
// linearly scan the children we have to see if there is one with this edit distance.
var start = currentNodeIndex * CompactEdgeAllocationSize;
var end = start + Math.Min(currentNode.EdgeCount, CompactEdgeAllocationSize);
for (var i = start; i < end; i++)
{
if (_compactEdges[i].EditDistance == editDistance)
{
childIndex = _compactEdges[i].ChildNodeIndex;
return true;
}
}
// If we've spilled over any edges, check there as well
if (currentNode.SpilloverEdges != null)
{
// Can't use the compact array. Have to use the spillover dictionary instead.
Debug.Assert(currentNode.SpilloverEdges.Count == (currentNode.EdgeCount - CompactEdgeAllocationSize));
return currentNode.SpilloverEdges.TryGetValue(editDistance, out childIndex);
}
childIndex = -1;
return false;
}
private struct BuilderNode(TextSpan characterSpan)
{
public readonly TextSpan CharacterSpan = characterSpan;
public int EdgeCount;
public Dictionary<int, int>? SpilloverEdges;
}
}
}
|