EditDistance.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Threading;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.PooledObjects;
 
namespace Roslyn.Utilities;
 
///<summary>
/// NOTE: Only use if you truly need an edit distance.  If you just want to compare words, use
/// the 'SpellChecker' type instead.
///
/// Implementation of the Damerau-Levenshtein edit distance algorithm from:
/// An Extension of the String-to-String Correction Problem:
/// Published in Journal of the ACM (JACM)
/// Volume 22 Issue 2, April 1975.
///
/// Important, unlike many edit distance algorithms out there, this one implements a true metric
/// that satisfies the triangle inequality.  (Unlike the "Optimal String Alignment" or "Restricted
/// string edit distance" solutions which do not).  This means this edit distance can be used in
/// other domains that require the triangle inequality (like BKTrees).
///
/// Specifically, this implementation satisfies the following inequality: D(x, y) + D(y, z) >= D(x, z)
/// (where D is the edit distance).
///</summary> 
internal readonly struct EditDistance(string text) : IDisposable
{
    // Our edit distance algorithm makes use of an 'infinite' value.  A value so high that it 
    // could never participate in an edit distance (and effectively means the path through it
    // is dead).
    //
    // We do *not* represent this with "int.MaxValue" due to the presence of certain addition
    // operations in the edit distance algorithm. These additions could cause int.MaxValue
    // to roll over to a very negative value (which would then look like the lowest cost
    // path).
    //
    // So we pick a value that is both effectively larger than any possible edit distance,
    // and also has no chance of overflowing.
    private const int Infinity = int.MaxValue >> 1;
 
    public const int BeyondThreshold = int.MaxValue;
 
    private readonly string _source = text ?? throw new ArgumentNullException(nameof(text));
    private readonly char[] _sourceLowerCaseCharacters = ConvertToLowercaseArray(text);
 
    private const int PooledArraySize = 512;
    private static readonly ObjectPool<char[]> s_pool = new ObjectPool<char[]>(() => new char[PooledArraySize]);
 
    private static char[] ConvertToLowercaseArray(string text)
    {
        var array = text.Length <= PooledArraySize
            ? s_pool.Allocate()
            : new char[text.Length];
 
        for (var i = 0; i < text.Length; i++)
            array[i] = CaseInsensitiveComparison.ToLower(text[i]);
 
        return array;
    }
 
    public void Dispose()
    {
        if (_sourceLowerCaseCharacters == null)
            throw new ObjectDisposedException(nameof(EditDistance));
 
        // Place properly sized arrays back in the pool. As these only store chars, no need
        // to clear them out before doing so.
        if (_sourceLowerCaseCharacters.Length == PooledArraySize)
            s_pool.Free(_sourceLowerCaseCharacters);
    }
 
    public static int GetEditDistance(string source, string target, int threshold = int.MaxValue)
    {
        using var editDistance = new EditDistance(source);
        return editDistance.GetEditDistance(target, threshold);
    }
 
    public int GetEditDistance(string target, int threshold = int.MaxValue)
    {
        if (_sourceLowerCaseCharacters == null)
            throw new ObjectDisposedException(nameof(EditDistance));
 
        Span<char> targetLowerCaseCharacters = target.Length < 512
            ? stackalloc char[target.Length]
            : new char[target.Length];
 
        for (var i = 0; i < target.Length; i++)
            targetLowerCaseCharacters[i] = CaseInsensitiveComparison.ToLower(target[i]);
 
        return GetEditDistance(
            _sourceLowerCaseCharacters.AsSpan(0, _source.Length),
            targetLowerCaseCharacters,
            threshold);
    }
 
    private const int MaxMatrixPoolDimension = 64;
    private static readonly ThreadLocal<int[,]> t_matrixPool =
        new(() => InitializeMatrix(new int[MaxMatrixPoolDimension, MaxMatrixPoolDimension]));
 
    // To find swapped characters we make use of a table that keeps track of the last location
    // we found that character.  For performance reasons we only do this work for ascii characters
    // (i.e. with value <= 127).  This allows us to just use a simple array we can index into instead
    // of needing something more expensive like a dictionary.
    private const int LastSeenIndexLength = 128;
    private static readonly ThreadLocal<int[]> t_lastSeenIndexPool =
        new(() => new int[LastSeenIndexLength]);
 
    private static int[,] GetMatrix(int width, int height)
    {
        if (width > MaxMatrixPoolDimension || height > MaxMatrixPoolDimension)
        {
            return InitializeMatrix(new int[width, height]);
        }
 
        return t_matrixPool.Value!;
    }
 
    private static int[,] InitializeMatrix(int[,] matrix)
    {
        // All matrices share the following in common:
        //
        // ------------------
        // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        // |∞ 0 1 2 3 4 5 6 7
        // |∞ 1
        // |∞ 2
        // |∞ 3
        // |∞ 4
        // |∞ 5
        // |∞ 6
        // |∞ 7
        //
        // So we initialize this once when the matrix is created.  For pooled arrays we only
        // have to do this once, and it will retain this layout for all future computations.
 
        var width = matrix.GetLength(0);
        var height = matrix.GetLength(1);
 
        for (var i = 0; i < width; i++)
        {
            matrix[i, 0] = Infinity;
 
            if (i < width - 1)
            {
                matrix[i + 1, 1] = i;
            }
        }
 
        for (var j = 0; j < height; j++)
        {
            matrix[0, j] = Infinity;
 
            if (j < height - 1)
            {
                matrix[1, j + 1] = j;
            }
        }
 
        return matrix;
    }
 
    public static int GetEditDistance(ReadOnlySpan<char> source, ReadOnlySpan<char> target, int threshold = int.MaxValue)
    {
        return source.Length <= target.Length
            ? GetEditDistanceWorker(source, target, threshold)
            : GetEditDistanceWorker(target, source, threshold);
    }
 
    private static int GetEditDistanceWorker(ReadOnlySpan<char> source, ReadOnlySpan<char> target, int threshold)
    {
        // Note: sourceLength will always be smaller or equal to targetLength.
        //
        // Also Note: sourceLength and targetLength values will mutate and represent the lengths 
        // of the portions of the arrays we want to compare.  However, even after mutation, hte
        // invariant that sourceLength is <= targetLength will remain.
        Debug.Assert(source.Length <= target.Length);
 
        // First:
        // Determine the common prefix/suffix portions of the strings.  We don't even need to 
        // consider them as they won't add anything to the edit cost.
        while (source.Length > 0 && source[^1] == target[^1])
        {
            source = source[..^1];
            target = target[..^1];
        }
 
        while (source.Length > 0 && source[0] == target[0])
        {
            source = source[1..];
            target = target[1..];
        }
 
        // 'sourceLength' and 'targetLength' are now the lengths of the substrings of our strings that we
        // want to compare. 'startIndex' is the starting point of the substrings in both array.
        //
        // If we've matched all of the 'source' string in the prefix and suffix of 'target'. then the edit
        // distance is just whatever operations we have to create the remaining target substring.
        //
        // Note: we don't have to check if targetLength is 0.  That's because targetLength being zero would
        // necessarily mean that sourceLength is 0.
        var sourceLength = source.Length;
        var targetLength = target.Length;
        if (sourceLength == 0)
        {
            return targetLength <= threshold ? targetLength : BeyondThreshold;
        }
 
        // The is the minimum number of edits we'd have to make.  i.e. if  'source' and 
        // 'target' are the same length, then we might not need to make any edits.  However,
        // if target has length 10 and source has length 7, then we're going to have to
        // make at least 3 edits no matter what.
        var minimumEditCount = targetLength - sourceLength;
        Debug.Assert(minimumEditCount >= 0);
 
        // If the number of edits we'd have to perform is greater than our threshold, then
        // there's no point in even continuing.
        if (minimumEditCount > threshold)
        {
            return BeyondThreshold;
        }
 
        // Say we want to find the edit distance between "sunday" and "saturday".  Our initial
        // matrix will be:
        //
        // (Note: for purposes of this explanation we will not be trimming off the common 
        // prefix/suffix of the strings.  That optimization does not affect any of the 
        // remainder of the explanation).
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 
        //    a |∞ 2
        //    t |∞ 3
        //    u |∞ 4
        //    r |∞ 5
        //    d |∞ 6
        //    a |∞ 7
        //    y |∞ 8
        //
        // Note that the matrix will always be square, or a rectangle that is taller htan it is 
        // longer.  Our 'source' is at the top, and our 'target' is on the left.  The edit distance
        // between any prefix of 'source' and any prefix of 'target' can then be found in 
        // the unfilled area of the matrix.  Specifically, if we have source.substring(0, m) and
        // target.substring(0, n), then the edit distance for them can be found at matrix position
        // (m+1, n+1).  This is why the 1'th row and 1'th column can be prefilled.  They represent
        // the cost to go from the empty target to the full source or the empty source to the full
        // target (respectively).  So, if we wanted to know the edit distance between "sun" and 
        // "sat", we'd look at (3+1, 3+1).  It then follows that our final edit distance between
        // the full source and target is in the lower right corner of this matrix.
        //
        // If we fill out the matrix fully we'll get:
        //          
        //           s u n d a y <-- source
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 0 1 2 3 4 5 
        //    a |∞ 2 1 1 2 3 3 4 
        //    t |∞ 3 2 2 2 3 4 4 
        //    u |∞ 4 3 2 3 3 4 5 
        //    r |∞ 5 4 3 3 4 4 5 
        //    d |∞ 6 5 4 4 3 4 5 
        //    a |∞ 7 6 5 5 4 3 4 
        //    y |∞ 8 7 6 6 5 4 3 <--
        //                     ^
        //                     |
        //
        // So in this case, the edit distance is 3.  Or, specifically, the edits:
        //
        //      Sunday -> Replace("n", "r") -> 
        //      Surday -> Insert("a") ->
        //      Saurday -> Insert("t") ->
        //      Saturday
        //
        //
        // Now: in the case where we want to know what the edit distance actually is (for example
        // when making a BKTree), we must fill out this entire array to get the true edit distance.
        //
        // However, in some cases we can do a bit better.  For example, if a client only wants to
        // the edit distance *when the edit distance will be less than some threshold* then we do
        // not need to examine the entire matrix.  We only want to examine until the point where
        // we realize that, no matter what, our final edit distance will be more than that threshold
        // (at which point we can return early).
        //
        // Some things are trivially easy to check.  First, the edit distance between two strings is at
        // *best* the difference of their lengths.  i.e. if i have "aa" and "aaaaa" then the edit
        // distance is 3 (the difference of 5 and 2).  If our threshold is less then 3 then there
        // is no way these two strings could match.  So we can leave early if we can tell it would
        // simply be impossible to get an edit distance within the specified threshold.
        //
        // Second, let's look at our matrix again:
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 
        //    a |∞ 2
        //    t |∞ 3
        //    u |∞ 4
        //    r |∞ 5
        //    d |∞ 6
        //    a |∞ 7
        //    y |∞ 8           *
        //
        // We want to know what the value is at *, and we want to stop as early as possible if it
        // is greater than our threshold.
        //
        // Given the edit distance rules we observe edit distance at any point (i,j) in the matrix will
        // always be greater than or equal to the value in (i-1, j-1).  i.e. the edit distance of
        // any two strings is going to be *at best* equal to the edit distance of those two strings
        // without their final characters.  If their final characters are the same, they'll have the
        // same edit distance.  If they are different, the edit distance will be greater.  Given 
        // that we know the final edit distance is in the lower right, we can discover something 
        // useful in the matrix.
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 
        //    a |∞ 2
        //    t |∞ 3 `
        //    u |∞ 4   `
        //    r |∞ 5     `
        //    d |∞ 6       `
        //    a |∞ 7         `
        //    y |∞ 8           *
        //
        // The slashes are the "bottom" diagonal leading to the lower right.  The value in the 
        // lower right will be strictly equal to or greater than any value on this diagonal.  
        // Thus, if that value exceeds the threshold, we know we can stop immediately as the 
        // total edit distance must be greater than the threshold.
        //
        // We can use similar logic to avoid even having to examine more of the matrix when we
        // have a threshold. First, consider the same diagonal.
        // 
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1
        //    a |∞ 2
        //    t |∞ 3 `
        //    u |∞ 4   `       x
        //    r |∞ 5     `     |
        //    d |∞ 6       `   |
        //    a |∞ 7         ` |
        //    y |∞ 8           *
        //
        // And then consider a point above that diagonal (indicated by x).  In the example
        // above, the edit distance to * from 'x' will be (x+4).  If, for example, threshold
        // was '2', then it would be impossible for the path from 'x' to provide a good
        // enough edit distance *ever*.   Similarly:
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1
        //    a |∞ 2
        //    t |∞ 3 `
        //    u |∞ 4   `
        //    r |∞ 5     `
        //    d |∞ 6       `
        //    a |∞ 7         `
        //    y |∞ 8     y - - *
        //
        // Here we see that the final edit distance will be "y+3".  Again, if the edit 
        // distance threshold is less than 3, then no path from y will provide a good
        // enough edit distance.
        //
        // So, if we had an edit distance threshold of 3, then the range around that
        // bottom diagonal that we should consider checking is:
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 | |
        //    a |∞ 2 | | |
        //    t |∞ 3 ` | | |
        //    u |∞ 4 - ` | | |
        //    r |∞ 5 - - ` | | |
        //    d |∞ 6 - - - ` | |
        //    a |∞ 7   - - - ` |
        //    y |∞ 8     - - - *
        //
        // Now, also consider that it will take a minimum of targetLength-sourceLength edits 
        // just to move to the lower diagonal from the upper diagonal.  That leaves
        // 'threshold - (targetLength - sourceLength)' edits remaining.  In this example, that
        // means '3 - (8 - 6)' = 1.  Because of this our lower diagonal offset is capped at:
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 | |
        //    a |∞ 2 | | |
        //    t |∞ 3 ` | | |
        //    u |∞ 4 - ` | | |
        //    r |∞ 5   - ` | | |
        //    d |∞ 6     - ` | |
        //    a |∞ 7       - ` |
        //    y |∞ 8         - *
        //
        // If we mark the upper diagonal appropriately we see the matrix as:
        //
        //           s u n d a y
        //      ----------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6
        //    s |∞ 1 ` |
        //    a |∞ 2   ` |
        //    t |∞ 3 `   ` |
        //    u |∞ 4 - `   ` |
        //    r |∞ 5   - `   ` |
        //    d |∞ 6     - `   `
        //    a |∞ 7       - `  
        //    y |∞ 8         - *
        //
        // Or, effectively, we only need to examine 'threshold - (targetLength - sourceLength)' 
        // above and below the diagonals.
        //
        // In practice, when a threshold is provided it is normally capped at '2'.  Given that,
        // the most around the diagonal we'll ever have to check is +/- 2 elements.  i.e. with
        // strings of length 10 we'd only check:
        // 
        //           a b c d e f g h i j
        //      ------------------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6 7 8 9 10
        //    m |∞ 1 * * *
        //    n |∞ 2 * * * *
        //    o |∞ 3 * * * * *
        //    p |∞ 4   * * * * *
        //    q |∞ 5     * * * * *
        //    r |∞ 6       * * * * *
        //    s |∞ 7         * * * * *
        //    t |∞ 8           * * * * *
        //    u |∞ 9             * * * *
        //    v |∞10               * * *
        //
        // or 10+18+16=44.  Or only 44%. if our threshold is two and our strings differ by length 
        // 2 then we have:
        //
        //           a b c d e f g h
        //      --------------------
        //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
        //      |∞ 0 1 2 3 4 5 6 7 8
        //    m |∞ 1 *
        //    n |∞ 2 * *
        //    o |∞ 3 * * *
        //    p |∞ 4   * * *
        //    q |∞ 5     * * *
        //    r |∞ 6       * * *
        //    s |∞ 7         * * *
        //    t |∞ 8           * * *
        //    u |∞ 9             * * 
        //    v |∞10               *  
        //
        // Then we examine 8+8+8=24 out of 80, or only 30% of the matrix.  As the strings
        // get larger, the savings increase as well.
 
        // --------------------------------------------------------------------------------
 
        // The highest cost it can be to convert a source to target is targetLength.  i.e.
        // changing all the characters in source to target (which would be be 'sourceLength'
        // changes), and then adding all the missing characters in 'target' (which is
        // 'targetLength' - 'sourceLength' changes).  Combined that's 'targetLength'.  
        //
        // So we can just cap our threshold here.  This makes some of the walking code 
        // below simpler.
        threshold = Math.Min(threshold, targetLength);
 
        var offset = threshold - minimumEditCount;
        Debug.Assert(offset >= 0);
 
        var matrix = GetMatrix(sourceLength + 2, targetLength + 2);
 
        var characterToLastSeenIndex_inSource = t_lastSeenIndexPool.Value!;
        Array.Clear(characterToLastSeenIndex_inSource, 0, LastSeenIndexLength);
 
        for (var i = 1; i <= sourceLength; i++)
        {
            var lastMatchIndex_inTarget = 0;
            var sourceChar = source[i - 1];
 
            // Determinethe portion of the column we actually want to examine.
            var jStart = Math.Max(1, i - offset);
            var jEnd = Math.Min(targetLength, i + minimumEditCount + offset);
 
            // If we're examining only a subportion of the column, then we need to make sure
            // that the values outside that range are set to Infinity.  That way we don't
            // consider them when we look through edit paths from above (for this column) or 
            // from the left (for the next column).
            if (jStart > 1)
            {
                matrix[i + 1, jStart] = Infinity;
            }
 
            if (jEnd < targetLength)
            {
                matrix[i + 1, jEnd + 2] = Infinity;
            }
 
            for (var j = jStart; j <= jEnd; j++)
            {
                var targetChar = target[j - 1];
 
                var i1 = targetChar < LastSeenIndexLength ? characterToLastSeenIndex_inSource[targetChar] : 0;
                var j1 = lastMatchIndex_inTarget;
 
                var matched = sourceChar == targetChar;
                if (matched)
                {
                    lastMatchIndex_inTarget = j;
                }
 
                matrix[i + 1, j + 1] = Min(
                    matrix[i, j] + (matched ? 0 : 1),
                    matrix[i + 1, j] + 1,
                    matrix[i, j + 1] + 1,
                    matrix[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
            }
 
            if (sourceChar < LastSeenIndexLength)
            {
                characterToLastSeenIndex_inSource[sourceChar] = i;
            }
 
            // Recall that minimumEditCount is simply the difference in length of our two
            // strings.  So matrix[i+1,i+1] is the cost for the upper-left diagonal of the
            // matrix.  matrix[i+1,i+1+minimumEditCount] is the cost for the lower right diagonal.
            // Here we are simply getting the lowest cost edit of hese two substrings so far.
            // If this lowest cost edit is greater than our threshold, then there is no need 
            // to proceed.
            if (matrix[i + 1, i + minimumEditCount + 1] > threshold)
            {
                return BeyondThreshold;
            }
        }
 
        return matrix[sourceLength + 1, targetLength + 1];
    }
 
    private static string ToString(int[,] matrix, int width, int height)
    {
        var sb = new StringBuilder();
        for (var j = 0; j < height; j++)
        {
            for (var i = 0; i < width; i++)
            {
                var v = matrix[i + 2, j + 2];
                sb.Append((v == Infinity ? "∞" : v.ToString()) + " ");
            }
 
            sb.AppendLine();
        }
 
        return sb.ToString().Trim();
    }
 
    private static int GetValue(Dictionary<char, int> da, char c)
        => da.TryGetValue(c, out var value) ? value : 0;
 
    private static int Min(int v1, int v2, int v3, int v4)
    {
        Debug.Assert(v1 >= 0);
        Debug.Assert(v2 >= 0);
        Debug.Assert(v3 >= 0);
        Debug.Assert(v4 >= 0);
 
        var min = v1;
        if (v2 < min)
        {
            min = v2;
        }
 
        if (v3 < min)
        {
            min = v3;
        }
 
        if (v4 < min)
        {
            min = v4;
        }
 
        Debug.Assert(min >= 0);
        return min;
    }
 
    private static void SetValue(int[,] matrix, int i, int j, int val)
    {
        // Matrix is -1 based, so we add 1 to both i and j to make it
        // possible to index into the actual storage.
        matrix[i + 1, j + 1] = val;
    }
}
File: src\Workspaces\SharedUtilitiesAndExtensions\Compiler\Core\Utilities\EditDistance.cs	Web Access
Project: src\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)