File: Differencing\Match.cs
Web Access
Project: src\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Linq;
 
namespace Microsoft.CodeAnalysis.Differencing;
 
public sealed partial class Match<TNode>
{
    private const double ExactMatchDistance = 0.0;
    private const double EpsilonDistance = 0.00001;
    private const double MatchingDistance1 = 0.25;
    private const double MatchingDistance2 = 0.5;
    private const double MatchingDistance3 = 0.75;
    private const double MaxDistance = 1.0;
    private readonly Dictionary<TNode, TNode> _oneToTwo = [];
    private readonly Dictionary<TNode, TNode> _twoToOne = [];
 
    internal Match(TNode root1, TNode root2, TreeComparer<TNode> comparer, IEnumerable<KeyValuePair<TNode, TNode>> knownMatches)
    {
        OldRoot = root1;
        NewRoot = root2;
        Comparer = comparer;
 
        var labelCount = comparer.LabelCount;
        CategorizeNodesByLabels(comparer, root1, labelCount, out var nodes1, out _);
        CategorizeNodesByLabels(comparer, root2, labelCount, out var nodes2, out _);
 
        // Root nodes always match. Add them before adding known matches to make sure we always have root mapping.
        TryAdd(root1, root2);
 
        if (knownMatches != null)
        {
            foreach (var knownMatch in knownMatches)
            {
                if (comparer.GetLabel(knownMatch.Key) != comparer.GetLabel(knownMatch.Value))
                {
                    throw new ArgumentException(string.Format(WorkspacesResources.Matching_nodes_0_and_1_must_have_the_same_label, knownMatch.Key, knownMatch.Value), nameof(knownMatches));
                }
 
                if (!comparer.TreesEqual(knownMatch.Key, root1))
                {
                    throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_old_tree, knownMatch.Key), nameof(knownMatches));
                }
 
                if (!comparer.TreesEqual(knownMatch.Value, root2))
                {
                    throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_new_tree, knownMatch.Value), nameof(knownMatches));
                }
 
                // skip pairs whose key or value is already mapped:
                TryAdd(knownMatch.Key, knownMatch.Value);
            }
        }
 
        ComputeMatch(nodes1, nodes2);
    }
 
    private static void CategorizeNodesByLabels(
        TreeComparer<TNode> comparer,
        TNode root,
        int labelCount,
        out List<TNode>[] nodes,
        out int totalCount)
    {
        nodes = new List<TNode>[labelCount];
        var count = 0;
 
        // It is important that we add the nodes in depth-first prefix order.
        // This order ensures that a node of a certain kind can have a parent of the same kind 
        // and we can still use tied-to-parent for that kind. That's because the parent will always
        // be processed earlier than the child due to depth-first prefix ordering.
        foreach (var node in comparer.GetDescendants(root))
        {
            var label = comparer.GetLabel(node);
            if (label < 0 || label >= labelCount)
            {
                throw new InvalidOperationException(string.Format(WorkspacesResources.Label_for_node_0_is_invalid_it_must_be_within_bracket_0_1, node, labelCount));
            }
 
            var list = nodes[label];
            if (list == null)
            {
                nodes[label] = list = [];
            }
 
            list.Add(node);
 
            count++;
        }
 
        totalCount = count;
    }
 
    private void ComputeMatch(List<TNode>[] nodes1, List<TNode>[] nodes2)
    {
        Debug.Assert(nodes1.Length == nodes2.Length);
 
        // --- The original FastMatch algorithm ---
        // 
        // For each leaf label l, and then for each internal node label l do:
        // a) S1 := chain T1(l)
        // b) S2 := chain T2(l)
        // c) lcs := LCS(S1, S2, Equal)
        // d) For each pair of nodes (x,y) in lcs add (x,y) to M.
        // e) Pair unmatched nodes with label l as in Algorithm Match, adding matches to M:
        //    For each unmatched node x in T1, if there is an unmatched node y in T2 such that equal(x,y) 
        //    then add (x,y) to M.
        //
        // equal(x,y) is defined as follows:
        //   x, y are leafs => equal(x,y) := label(x) == label(y) && compare(value(x), value(y)) <= f
        //   x, y are nodes => equal(x,y) := label(x) == label(y) && |common(x,y)| / max(|x|, |y|) > t 
        // where f, t are constants.
        //
        // --- Actual implementation ---
        //
        // We also categorize nodes by their labels, but then we proceed differently:
        // 
        // 1) A label may be marked "tied to parent". Let x, y have both label l and l is "tied to parent".
        //    Then (x,y) can be in M only if (parent(x), parent(y)) in M.
        //    Thus we require labels of children tied to a parent to be preceded by all their possible parent labels.
        //
        // 2) Rather than defining function equal in terms of constants f and t, which are hard to get right,
        //    we try to match multiple times with different threshold for node distance.
        //    The comparer defines the distance [0..1] between two nodes and it can do so by analyzing 
        //    the node structure and value. The comparer can tune the distance specifically for each node kind.
        //    We first try to match nodes of the same labels to the exactly matching or almost matching counterparts.
        //    The we keep increasing the threshold and keep adding matches. 
 
        for (var l = 0; l < nodes1.Length; l++)
        {
            if (nodes1[l] != null && nodes2[l] != null)
            {
                ComputeMatchForLabel(l, nodes1[l], nodes2[l]);
            }
        }
    }
 
    private void ComputeMatchForLabel(int label, List<TNode> s1, List<TNode> s2)
    {
        var tiedToAncestor = Comparer.TiedToAncestor(label);
 
        ComputeMatchForLabel(s1, s2, tiedToAncestor, EpsilonDistance);     // almost exact match
        ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance1);   // ok match
        ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance2);   // ok match
        ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance3);   // ok match
        ComputeMatchForLabel(s1, s2, tiedToAncestor, MaxDistance);         // any match
    }
 
    private void ComputeMatchForLabel(List<TNode> s1, List<TNode> s2, int tiedToAncestor, double maxAcceptableDistance)
    {
        // Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will
        // be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity
        // to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2
        // with the given label. 
        // Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node.
        // So in the case of totally matching sequences, we process them in O(n) - 
        // both node1 and firstNonMatch2 will be advanced simultaneously.
 
        Debug.Assert(maxAcceptableDistance is >= ExactMatchDistance and <= MaxDistance);
        var count1 = s1.Count;
        var count2 = s2.Count;
        var firstNonMatch2 = 0;
 
        for (var i1 = 0; i1 < count1; i1++)
        {
            var node1 = s1[i1];
 
            // Skip this guy if it already has a partner
            if (HasPartnerInTree2(node1))
            {
                continue;
            }
 
            // Find node2 that matches node1 the best, i.e. has minimal distance.
 
            var bestDistance = MaxDistance * 2;
            TNode bestMatch = default;
            var matched = false;
            int i2;
            for (i2 = firstNonMatch2; i2 < count2; i2++)
            {
                var node2 = s2[i2];
 
                // Skip this guy if it already has a partner
                if (HasPartnerInTree1(node2))
                {
                    continue;
                }
 
                // this requires parents to be processed before their children:
                if (tiedToAncestor > 0)
                {
                    // TODO (tomat): For nodes tied to their parents, 
                    // consider avoiding matching them to all other nodes of the same label.
                    // Rather we should only match them with their siblings that share the same parent.
 
                    // Check if nodes that are configured to be tied to their ancestor have the respective ancestor matching.
                    // In cases when we compare substrees rooted below both of these ancestors we assume the ancestors are
                    // matching since the roots of the subtrees must match and therefore their ancestors must match as well.
                    // If one node's ancestor is present in the subtree and the other isn't then we are not in the scenario
                    // of comparing subtrees with matching roots and thus we consider the nodes not matching.
 
                    var hasAncestor1 = Comparer.TryGetAncestor(node1, tiedToAncestor, out var ancestor1);
                    var hasAncestor2 = Comparer.TryGetAncestor(node2, tiedToAncestor, out var ancestor2);
                    if (hasAncestor1 != hasAncestor2)
                    {
                        continue;
                    }
 
                    if (hasAncestor1)
                    {
                        // Since CategorizeNodesByLabels added nodes to the s1/s2 lists in depth-first prefix order,
                        // we can also accept equality in the following condition. That's because we find the partner 
                        // of the parent node before we get to finding it for the child node of the same kind.
                        Debug.Assert(Comparer.GetLabel(ancestor1) <= Comparer.GetLabel(node1));
 
                        if (!Contains(ancestor1, ancestor2))
                        {
                            continue;
                        }
                    }
                }
 
                // We know that
                // 1. (node1, node2) not in M
                // 2. Both of their parents are matched to the same parent (or are not matched)
                //
                // Now, we have no other choice than comparing the node "values"
                // and looking for the one with the smaller distance.
 
                var distance = Comparer.GetDistance(node1, node2);
                if (distance < bestDistance)
                {
                    matched = true;
                    bestMatch = node2;
                    bestDistance = distance;
 
                    // We only stop if we've got an exact match. This is to resolve the problem
                    // of entities with identical names(name is often used as the "value" of a
                    // node) but with different "sub-values" (e.g. two locals may have the same name
                    // but different types. Since the type is not part of the value, we don't want
                    // to stop looking for the best match if we don't have an exact match).
                    if (distance == ExactMatchDistance)
                    {
                        break;
                    }
                }
            }
 
            if (matched && bestDistance <= maxAcceptableDistance)
            {
                var added = TryAdd(node1, bestMatch);
 
                // We checked above that node1 doesn't have a partner. 
                // The map is a bijection by construction, so we should be able to add the mapping.
                Debug.Assert(added);
 
                // If we exactly matched to firstNonMatch2 we can advance it.
                if (i2 == firstNonMatch2)
                {
                    firstNonMatch2 = i2 + 1;
                }
 
                if (firstNonMatch2 == count2)
                {
                    return;
                }
            }
        }
    }
 
    internal bool TryAdd(TNode node1, TNode node2)
    {
        Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
        Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
 
        if (_oneToTwo.ContainsKey(node1) || _twoToOne.ContainsKey(node2))
        {
            return false;
        }
 
        _oneToTwo.Add(node1, node2);
        _twoToOne.Add(node2, node1);
        return true;
    }
 
    internal bool TryGetPartnerInTree1(TNode node2, out TNode partner1)
    {
        var result = _twoToOne.TryGetValue(node2, out partner1);
        Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
        Debug.Assert(!result || Comparer.TreesEqual(partner1, OldRoot));
        return result;
    }
 
    internal bool HasPartnerInTree1(TNode node2)
    {
        Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
        return _twoToOne.ContainsKey(node2);
    }
 
    internal bool TryGetPartnerInTree2(TNode node1, out TNode partner2)
    {
        var result = _oneToTwo.TryGetValue(node1, out partner2);
        Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
        Debug.Assert(!result || Comparer.TreesEqual(partner2, NewRoot));
        return result;
    }
 
    internal bool HasPartnerInTree2(TNode node1)
    {
        Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
        return _oneToTwo.ContainsKey(node1);
    }
 
    internal bool Contains(TNode node1, TNode node2)
    {
        Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
        return TryGetPartnerInTree2(node1, out var partner2) && node2.Equals(partner2);
    }
 
    public TreeComparer<TNode> Comparer { get; }
 
    public TNode OldRoot { get; }
 
    public TNode NewRoot { get; }
 
    public IReadOnlyDictionary<TNode, TNode> Matches
    {
        get
        {
            return new ReadOnlyDictionary<TNode, TNode>(_oneToTwo);
        }
    }
 
    public IReadOnlyDictionary<TNode, TNode> ReverseMatches
    {
        get
        {
            return new ReadOnlyDictionary<TNode, TNode>(_twoToOne);
        }
    }
 
    public bool TryGetNewNode(TNode oldNode, out TNode newNode)
        => _oneToTwo.TryGetValue(oldNode, out newNode);
 
    public bool TryGetOldNode(TNode newNode, out TNode oldNode)
        => _twoToOne.TryGetValue(newNode, out oldNode);
 
    /// <summary>
    /// Returns an edit script (a sequence of edits) that transform <see cref="OldRoot"/> subtree 
    /// to <see cref="NewRoot"/> subtree.
    /// </summary>
    public EditScript<TNode> GetTreeEdits()
        => new(this);
 
    /// <summary>
    /// Returns an edit script (a sequence of edits) that transform a sequence of nodes <paramref name="oldNodes"/>
    /// to a sequence of nodes <paramref name="newNodes"/>. 
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="oldNodes"/> or <paramref name="newNodes"/> is a null reference.</exception>
    public IEnumerable<Edit<TNode>> GetSequenceEdits(IEnumerable<TNode> oldNodes, IEnumerable<TNode> newNodes)
    {
        if (oldNodes == null)
        {
            throw new ArgumentNullException(nameof(oldNodes));
        }
 
        if (newNodes == null)
        {
            throw new ArgumentNullException(nameof(newNodes));
        }
 
        var oldList = (oldNodes as IReadOnlyList<TNode>) ?? oldNodes.ToList();
        var newList = (newNodes as IReadOnlyList<TNode>) ?? newNodes.ToList();
 
        return new LongestCommonSubsequence(this).GetEdits(oldList, newList);
    }
}