|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Threading;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.PooledObjects;
namespace Roslyn.Utilities;
///<summary>
/// NOTE: Only use if you truly need an edit distance. If you just want to compare words, use
/// the 'SpellChecker' type instead.
///
/// Implementation of the Damerau-Levenshtein edit distance algorithm from:
/// An Extension of the String-to-String Correction Problem:
/// Published in Journal of the ACM (JACM)
/// Volume 22 Issue 2, April 1975.
///
/// Important, unlike many edit distance algorithms out there, this one implements a true metric
/// that satisfies the triangle inequality. (Unlike the "Optimal String Alignment" or "Restricted
/// string edit distance" solutions which do not). This means this edit distance can be used in
/// other domains that require the triangle inequality (like BKTrees).
///
/// Specifically, this implementation satisfies the following inequality: D(x, y) + D(y, z) >= D(x, z)
/// (where D is the edit distance).
///</summary>
internal readonly struct EditDistance(string text) : IDisposable
{
// Our edit distance algorithm makes use of an 'infinite' value. A value so high that it
// could never participate in an edit distance (and effectively means the path through it
// is dead).
//
// We do *not* represent this with "int.MaxValue" due to the presence of certain addition
// operations in the edit distance algorithm. These additions could cause int.MaxValue
// to roll over to a very negative value (which would then look like the lowest cost
// path).
//
// So we pick a value that is both effectively larger than any possible edit distance,
// and also has no chance of overflowing.
private const int Infinity = int.MaxValue >> 1;
public const int BeyondThreshold = int.MaxValue;
private readonly string _source = text ?? throw new ArgumentNullException(nameof(text));
private readonly char[] _sourceLowerCaseCharacters = ConvertToLowercaseArray(text);
private const int PooledArraySize = 512;
private static readonly ObjectPool<char[]> s_pool = new ObjectPool<char[]>(() => new char[PooledArraySize]);
private static char[] ConvertToLowercaseArray(string text)
{
var array = text.Length <= PooledArraySize
? s_pool.Allocate()
: new char[text.Length];
for (var i = 0; i < text.Length; i++)
array[i] = CaseInsensitiveComparison.ToLower(text[i]);
return array;
}
public void Dispose()
{
if (_sourceLowerCaseCharacters == null)
throw new ObjectDisposedException(nameof(EditDistance));
// Place properly sized arrays back in the pool. As these only store chars, no need
// to clear them out before doing so.
if (_sourceLowerCaseCharacters.Length == PooledArraySize)
s_pool.Free(_sourceLowerCaseCharacters);
}
public static int GetEditDistance(string source, string target, int threshold = int.MaxValue)
{
using var editDistance = new EditDistance(source);
return editDistance.GetEditDistance(target, threshold);
}
public int GetEditDistance(string target, int threshold = int.MaxValue)
{
if (_sourceLowerCaseCharacters == null)
throw new ObjectDisposedException(nameof(EditDistance));
Span<char> targetLowerCaseCharacters = target.Length < 512
? stackalloc char[target.Length]
: new char[target.Length];
for (var i = 0; i < target.Length; i++)
targetLowerCaseCharacters[i] = CaseInsensitiveComparison.ToLower(target[i]);
return GetEditDistance(
_sourceLowerCaseCharacters.AsSpan(0, _source.Length),
targetLowerCaseCharacters,
threshold);
}
private const int MaxMatrixPoolDimension = 64;
private static readonly ThreadLocal<int[,]> t_matrixPool =
new(() => InitializeMatrix(new int[MaxMatrixPoolDimension, MaxMatrixPoolDimension]));
// To find swapped characters we make use of a table that keeps track of the last location
// we found that character. For performance reasons we only do this work for ascii characters
// (i.e. with value <= 127). This allows us to just use a simple array we can index into instead
// of needing something more expensive like a dictionary.
private const int LastSeenIndexLength = 128;
private static readonly ThreadLocal<int[]> t_lastSeenIndexPool =
new(() => new int[LastSeenIndexLength]);
private static int[,] GetMatrix(int width, int height)
{
if (width > MaxMatrixPoolDimension || height > MaxMatrixPoolDimension)
{
return InitializeMatrix(new int[width, height]);
}
return t_matrixPool.Value!;
}
private static int[,] InitializeMatrix(int[,] matrix)
{
// All matrices share the following in common:
//
// ------------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6 7
// |∞ 1
// |∞ 2
// |∞ 3
// |∞ 4
// |∞ 5
// |∞ 6
// |∞ 7
//
// So we initialize this once when the matrix is created. For pooled arrays we only
// have to do this once, and it will retain this layout for all future computations.
var width = matrix.GetLength(0);
var height = matrix.GetLength(1);
for (var i = 0; i < width; i++)
{
matrix[i, 0] = Infinity;
if (i < width - 1)
{
matrix[i + 1, 1] = i;
}
}
for (var j = 0; j < height; j++)
{
matrix[0, j] = Infinity;
if (j < height - 1)
{
matrix[1, j + 1] = j;
}
}
return matrix;
}
public static int GetEditDistance(ReadOnlySpan<char> source, ReadOnlySpan<char> target, int threshold = int.MaxValue)
{
return source.Length <= target.Length
? GetEditDistanceWorker(source, target, threshold)
: GetEditDistanceWorker(target, source, threshold);
}
private static int GetEditDistanceWorker(ReadOnlySpan<char> source, ReadOnlySpan<char> target, int threshold)
{
// Note: sourceLength will always be smaller or equal to targetLength.
//
// Also Note: sourceLength and targetLength values will mutate and represent the lengths
// of the portions of the arrays we want to compare. However, even after mutation, hte
// invariant that sourceLength is <= targetLength will remain.
Debug.Assert(source.Length <= target.Length);
// First:
// Determine the common prefix/suffix portions of the strings. We don't even need to
// consider them as they won't add anything to the edit cost.
while (source.Length > 0 && source[^1] == target[^1])
{
source = source[..^1];
target = target[..^1];
}
while (source.Length > 0 && source[0] == target[0])
{
source = source[1..];
target = target[1..];
}
// 'sourceLength' and 'targetLength' are now the lengths of the substrings of our strings that we
// want to compare. 'startIndex' is the starting point of the substrings in both array.
//
// If we've matched all of the 'source' string in the prefix and suffix of 'target'. then the edit
// distance is just whatever operations we have to create the remaining target substring.
//
// Note: we don't have to check if targetLength is 0. That's because targetLength being zero would
// necessarily mean that sourceLength is 0.
var sourceLength = source.Length;
var targetLength = target.Length;
if (sourceLength == 0)
{
return targetLength <= threshold ? targetLength : BeyondThreshold;
}
// The is the minimum number of edits we'd have to make. i.e. if 'source' and
// 'target' are the same length, then we might not need to make any edits. However,
// if target has length 10 and source has length 7, then we're going to have to
// make at least 3 edits no matter what.
var minimumEditCount = targetLength - sourceLength;
Debug.Assert(minimumEditCount >= 0);
// If the number of edits we'd have to perform is greater than our threshold, then
// there's no point in even continuing.
if (minimumEditCount > threshold)
{
return BeyondThreshold;
}
// Say we want to find the edit distance between "sunday" and "saturday". Our initial
// matrix will be:
//
// (Note: for purposes of this explanation we will not be trimming off the common
// prefix/suffix of the strings. That optimization does not affect any of the
// remainder of the explanation).
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3
// u |∞ 4
// r |∞ 5
// d |∞ 6
// a |∞ 7
// y |∞ 8
//
// Note that the matrix will always be square, or a rectangle that is taller htan it is
// longer. Our 'source' is at the top, and our 'target' is on the left. The edit distance
// between any prefix of 'source' and any prefix of 'target' can then be found in
// the unfilled area of the matrix. Specifically, if we have source.substring(0, m) and
// target.substring(0, n), then the edit distance for them can be found at matrix position
// (m+1, n+1). This is why the 1'th row and 1'th column can be prefilled. They represent
// the cost to go from the empty target to the full source or the empty source to the full
// target (respectively). So, if we wanted to know the edit distance between "sun" and
// "sat", we'd look at (3+1, 3+1). It then follows that our final edit distance between
// the full source and target is in the lower right corner of this matrix.
//
// If we fill out the matrix fully we'll get:
//
// s u n d a y <-- source
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 0 1 2 3 4 5
// a |∞ 2 1 1 2 3 3 4
// t |∞ 3 2 2 2 3 4 4
// u |∞ 4 3 2 3 3 4 5
// r |∞ 5 4 3 3 4 4 5
// d |∞ 6 5 4 4 3 4 5
// a |∞ 7 6 5 5 4 3 4
// y |∞ 8 7 6 6 5 4 3 <--
// ^
// |
//
// So in this case, the edit distance is 3. Or, specifically, the edits:
//
// Sunday -> Replace("n", "r") ->
// Surday -> Insert("a") ->
// Saurday -> Insert("t") ->
// Saturday
//
//
// Now: in the case where we want to know what the edit distance actually is (for example
// when making a BKTree), we must fill out this entire array to get the true edit distance.
//
// However, in some cases we can do a bit better. For example, if a client only wants to
// the edit distance *when the edit distance will be less than some threshold* then we do
// not need to examine the entire matrix. We only want to examine until the point where
// we realize that, no matter what, our final edit distance will be more than that threshold
// (at which point we can return early).
//
// Some things are trivially easy to check. First, the edit distance between two strings is at
// *best* the difference of their lengths. i.e. if i have "aa" and "aaaaa" then the edit
// distance is 3 (the difference of 5 and 2). If our threshold is less then 3 then there
// is no way these two strings could match. So we can leave early if we can tell it would
// simply be impossible to get an edit distance within the specified threshold.
//
// Second, let's look at our matrix again:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3
// u |∞ 4
// r |∞ 5
// d |∞ 6
// a |∞ 7
// y |∞ 8 *
//
// We want to know what the value is at *, and we want to stop as early as possible if it
// is greater than our threshold.
//
// Given the edit distance rules we observe edit distance at any point (i,j) in the matrix will
// always be greater than or equal to the value in (i-1, j-1). i.e. the edit distance of
// any two strings is going to be *at best* equal to the edit distance of those two strings
// without their final characters. If their final characters are the same, they'll have the
// same edit distance. If they are different, the edit distance will be greater. Given
// that we know the final edit distance is in the lower right, we can discover something
// useful in the matrix.
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 `
// r |∞ 5 `
// d |∞ 6 `
// a |∞ 7 `
// y |∞ 8 *
//
// The slashes are the "bottom" diagonal leading to the lower right. The value in the
// lower right will be strictly equal to or greater than any value on this diagonal.
// Thus, if that value exceeds the threshold, we know we can stop immediately as the
// total edit distance must be greater than the threshold.
//
// We can use similar logic to avoid even having to examine more of the matrix when we
// have a threshold. First, consider the same diagonal.
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 ` x
// r |∞ 5 ` |
// d |∞ 6 ` |
// a |∞ 7 ` |
// y |∞ 8 *
//
// And then consider a point above that diagonal (indicated by x). In the example
// above, the edit distance to * from 'x' will be (x+4). If, for example, threshold
// was '2', then it would be impossible for the path from 'x' to provide a good
// enough edit distance *ever*. Similarly:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 `
// r |∞ 5 `
// d |∞ 6 `
// a |∞ 7 `
// y |∞ 8 y - - *
//
// Here we see that the final edit distance will be "y+3". Again, if the edit
// distance threshold is less than 3, then no path from y will provide a good
// enough edit distance.
//
// So, if we had an edit distance threshold of 3, then the range around that
// bottom diagonal that we should consider checking is:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 | |
// a |∞ 2 | | |
// t |∞ 3 ` | | |
// u |∞ 4 - ` | | |
// r |∞ 5 - - ` | | |
// d |∞ 6 - - - ` | |
// a |∞ 7 - - - ` |
// y |∞ 8 - - - *
//
// Now, also consider that it will take a minimum of targetLength-sourceLength edits
// just to move to the lower diagonal from the upper diagonal. That leaves
// 'threshold - (targetLength - sourceLength)' edits remaining. In this example, that
// means '3 - (8 - 6)' = 1. Because of this our lower diagonal offset is capped at:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 | |
// a |∞ 2 | | |
// t |∞ 3 ` | | |
// u |∞ 4 - ` | | |
// r |∞ 5 - ` | | |
// d |∞ 6 - ` | |
// a |∞ 7 - ` |
// y |∞ 8 - *
//
// If we mark the upper diagonal appropriately we see the matrix as:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 ` |
// a |∞ 2 ` |
// t |∞ 3 ` ` |
// u |∞ 4 - ` ` |
// r |∞ 5 - ` ` |
// d |∞ 6 - ` `
// a |∞ 7 - `
// y |∞ 8 - *
//
// Or, effectively, we only need to examine 'threshold - (targetLength - sourceLength)'
// above and below the diagonals.
//
// In practice, when a threshold is provided it is normally capped at '2'. Given that,
// the most around the diagonal we'll ever have to check is +/- 2 elements. i.e. with
// strings of length 10 we'd only check:
//
// a b c d e f g h i j
// ------------------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6 7 8 9 10
// m |∞ 1 * * *
// n |∞ 2 * * * *
// o |∞ 3 * * * * *
// p |∞ 4 * * * * *
// q |∞ 5 * * * * *
// r |∞ 6 * * * * *
// s |∞ 7 * * * * *
// t |∞ 8 * * * * *
// u |∞ 9 * * * *
// v |∞10 * * *
//
// or 10+18+16=44. Or only 44%. if our threshold is two and our strings differ by length
// 2 then we have:
//
// a b c d e f g h
// --------------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6 7 8
// m |∞ 1 *
// n |∞ 2 * *
// o |∞ 3 * * *
// p |∞ 4 * * *
// q |∞ 5 * * *
// r |∞ 6 * * *
// s |∞ 7 * * *
// t |∞ 8 * * *
// u |∞ 9 * *
// v |∞10 *
//
// Then we examine 8+8+8=24 out of 80, or only 30% of the matrix. As the strings
// get larger, the savings increase as well.
// --------------------------------------------------------------------------------
// The highest cost it can be to convert a source to target is targetLength. i.e.
// changing all the characters in source to target (which would be be 'sourceLength'
// changes), and then adding all the missing characters in 'target' (which is
// 'targetLength' - 'sourceLength' changes). Combined that's 'targetLength'.
//
// So we can just cap our threshold here. This makes some of the walking code
// below simpler.
threshold = Math.Min(threshold, targetLength);
var offset = threshold - minimumEditCount;
Debug.Assert(offset >= 0);
var matrix = GetMatrix(sourceLength + 2, targetLength + 2);
var characterToLastSeenIndex_inSource = t_lastSeenIndexPool.Value!;
Array.Clear(characterToLastSeenIndex_inSource, 0, LastSeenIndexLength);
for (var i = 1; i <= sourceLength; i++)
{
var lastMatchIndex_inTarget = 0;
var sourceChar = source[i - 1];
// Determinethe portion of the column we actually want to examine.
var jStart = Math.Max(1, i - offset);
var jEnd = Math.Min(targetLength, i + minimumEditCount + offset);
// If we're examining only a subportion of the column, then we need to make sure
// that the values outside that range are set to Infinity. That way we don't
// consider them when we look through edit paths from above (for this column) or
// from the left (for the next column).
if (jStart > 1)
{
matrix[i + 1, jStart] = Infinity;
}
if (jEnd < targetLength)
{
matrix[i + 1, jEnd + 2] = Infinity;
}
for (var j = jStart; j <= jEnd; j++)
{
var targetChar = target[j - 1];
var i1 = targetChar < LastSeenIndexLength ? characterToLastSeenIndex_inSource[targetChar] : 0;
var j1 = lastMatchIndex_inTarget;
var matched = sourceChar == targetChar;
if (matched)
{
lastMatchIndex_inTarget = j;
}
matrix[i + 1, j + 1] = Min(
matrix[i, j] + (matched ? 0 : 1),
matrix[i + 1, j] + 1,
matrix[i, j + 1] + 1,
matrix[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
}
if (sourceChar < LastSeenIndexLength)
{
characterToLastSeenIndex_inSource[sourceChar] = i;
}
// Recall that minimumEditCount is simply the difference in length of our two
// strings. So matrix[i+1,i+1] is the cost for the upper-left diagonal of the
// matrix. matrix[i+1,i+1+minimumEditCount] is the cost for the lower right diagonal.
// Here we are simply getting the lowest cost edit of hese two substrings so far.
// If this lowest cost edit is greater than our threshold, then there is no need
// to proceed.
if (matrix[i + 1, i + minimumEditCount + 1] > threshold)
{
return BeyondThreshold;
}
}
return matrix[sourceLength + 1, targetLength + 1];
}
private static string ToString(int[,] matrix, int width, int height)
{
var sb = new StringBuilder();
for (var j = 0; j < height; j++)
{
for (var i = 0; i < width; i++)
{
var v = matrix[i + 2, j + 2];
sb.Append((v == Infinity ? "∞" : v.ToString()) + " ");
}
sb.AppendLine();
}
return sb.ToString().Trim();
}
private static int GetValue(Dictionary<char, int> da, char c)
=> da.TryGetValue(c, out var value) ? value : 0;
private static int Min(int v1, int v2, int v3, int v4)
{
Debug.Assert(v1 >= 0);
Debug.Assert(v2 >= 0);
Debug.Assert(v3 >= 0);
Debug.Assert(v4 >= 0);
var min = v1;
if (v2 < min)
{
min = v2;
}
if (v3 < min)
{
min = v3;
}
if (v4 < min)
{
min = v4;
}
Debug.Assert(min >= 0);
return min;
}
private static void SetValue(int[,] matrix, int i, int j, int val)
{
// Matrix is -1 based, so we add 1 to both i and j to make it
// possible to index into the actual storage.
matrix[i + 1, j + 1] = val;
}
}
|