|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
#nullable disable
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Linq;
namespace Microsoft.CodeAnalysis.Differencing;
public sealed partial class Match<TNode>
{
private const double ExactMatchDistance = 0.0;
private const double EpsilonDistance = 0.00001;
private const double MatchingDistance1 = 0.25;
private const double MatchingDistance2 = 0.5;
private const double MatchingDistance3 = 0.75;
private const double MaxDistance = 1.0;
private readonly Dictionary<TNode, TNode> _oneToTwo = [];
private readonly Dictionary<TNode, TNode> _twoToOne = [];
internal Match(TNode root1, TNode root2, TreeComparer<TNode> comparer, IEnumerable<KeyValuePair<TNode, TNode>> knownMatches)
{
OldRoot = root1;
NewRoot = root2;
Comparer = comparer;
var labelCount = comparer.LabelCount;
CategorizeNodesByLabels(comparer, root1, labelCount, out var nodes1, out _);
CategorizeNodesByLabels(comparer, root2, labelCount, out var nodes2, out _);
// Root nodes always match. Add them before adding known matches to make sure we always have root mapping.
TryAdd(root1, root2);
if (knownMatches != null)
{
foreach (var knownMatch in knownMatches)
{
if (comparer.GetLabel(knownMatch.Key) != comparer.GetLabel(knownMatch.Value))
{
throw new ArgumentException(string.Format(WorkspacesResources.Matching_nodes_0_and_1_must_have_the_same_label, knownMatch.Key, knownMatch.Value), nameof(knownMatches));
}
if (!comparer.TreesEqual(knownMatch.Key, root1))
{
throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_old_tree, knownMatch.Key), nameof(knownMatches));
}
if (!comparer.TreesEqual(knownMatch.Value, root2))
{
throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_new_tree, knownMatch.Value), nameof(knownMatches));
}
// skip pairs whose key or value is already mapped:
TryAdd(knownMatch.Key, knownMatch.Value);
}
}
ComputeMatch(nodes1, nodes2);
}
private static void CategorizeNodesByLabels(
TreeComparer<TNode> comparer,
TNode root,
int labelCount,
out List<TNode>[] nodes,
out int totalCount)
{
nodes = new List<TNode>[labelCount];
var count = 0;
// It is important that we add the nodes in depth-first prefix order.
// This order ensures that a node of a certain kind can have a parent of the same kind
// and we can still use tied-to-parent for that kind. That's because the parent will always
// be processed earlier than the child due to depth-first prefix ordering.
foreach (var node in comparer.GetDescendants(root))
{
var label = comparer.GetLabel(node);
if (label < 0 || label >= labelCount)
{
throw new InvalidOperationException(string.Format(WorkspacesResources.Label_for_node_0_is_invalid_it_must_be_within_bracket_0_1, node, labelCount));
}
var list = nodes[label];
if (list == null)
{
nodes[label] = list = [];
}
list.Add(node);
count++;
}
totalCount = count;
}
private void ComputeMatch(List<TNode>[] nodes1, List<TNode>[] nodes2)
{
Debug.Assert(nodes1.Length == nodes2.Length);
// --- The original FastMatch algorithm ---
//
// For each leaf label l, and then for each internal node label l do:
// a) S1 := chain T1(l)
// b) S2 := chain T2(l)
// c) lcs := LCS(S1, S2, Equal)
// d) For each pair of nodes (x,y) in lcs add (x,y) to M.
// e) Pair unmatched nodes with label l as in Algorithm Match, adding matches to M:
// For each unmatched node x in T1, if there is an unmatched node y in T2 such that equal(x,y)
// then add (x,y) to M.
//
// equal(x,y) is defined as follows:
// x, y are leafs => equal(x,y) := label(x) == label(y) && compare(value(x), value(y)) <= f
// x, y are nodes => equal(x,y) := label(x) == label(y) && |common(x,y)| / max(|x|, |y|) > t
// where f, t are constants.
//
// --- Actual implementation ---
//
// We also categorize nodes by their labels, but then we proceed differently:
//
// 1) A label may be marked "tied to parent". Let x, y have both label l and l is "tied to parent".
// Then (x,y) can be in M only if (parent(x), parent(y)) in M.
// Thus we require labels of children tied to a parent to be preceded by all their possible parent labels.
//
// 2) Rather than defining function equal in terms of constants f and t, which are hard to get right,
// we try to match multiple times with different threshold for node distance.
// The comparer defines the distance [0..1] between two nodes and it can do so by analyzing
// the node structure and value. The comparer can tune the distance specifically for each node kind.
// We first try to match nodes of the same labels to the exactly matching or almost matching counterparts.
// The we keep increasing the threshold and keep adding matches.
for (var l = 0; l < nodes1.Length; l++)
{
if (nodes1[l] != null && nodes2[l] != null)
{
ComputeMatchForLabel(l, nodes1[l], nodes2[l]);
}
}
}
private void ComputeMatchForLabel(int label, List<TNode> s1, List<TNode> s2)
{
var tiedToAncestor = Comparer.TiedToAncestor(label);
ComputeMatchForLabel(s1, s2, tiedToAncestor, EpsilonDistance); // almost exact match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance1); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance2); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance3); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MaxDistance); // any match
}
private void ComputeMatchForLabel(List<TNode> s1, List<TNode> s2, int tiedToAncestor, double maxAcceptableDistance)
{
// Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will
// be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity
// to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2
// with the given label.
// Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node.
// So in the case of totally matching sequences, we process them in O(n) -
// both node1 and firstNonMatch2 will be advanced simultaneously.
Debug.Assert(maxAcceptableDistance is >= ExactMatchDistance and <= MaxDistance);
var count1 = s1.Count;
var count2 = s2.Count;
var firstNonMatch2 = 0;
for (var i1 = 0; i1 < count1; i1++)
{
var node1 = s1[i1];
// Skip this guy if it already has a partner
if (HasPartnerInTree2(node1))
{
continue;
}
// Find node2 that matches node1 the best, i.e. has minimal distance.
var bestDistance = MaxDistance * 2;
TNode bestMatch = default;
var matched = false;
int i2;
for (i2 = firstNonMatch2; i2 < count2; i2++)
{
var node2 = s2[i2];
// Skip this guy if it already has a partner
if (HasPartnerInTree1(node2))
{
continue;
}
// this requires parents to be processed before their children:
if (tiedToAncestor > 0)
{
// TODO (tomat): For nodes tied to their parents,
// consider avoiding matching them to all other nodes of the same label.
// Rather we should only match them with their siblings that share the same parent.
// Check if nodes that are configured to be tied to their ancestor have the respective ancestor matching.
// In cases when we compare substrees rooted below both of these ancestors we assume the ancestors are
// matching since the roots of the subtrees must match and therefore their ancestors must match as well.
// If one node's ancestor is present in the subtree and the other isn't then we are not in the scenario
// of comparing subtrees with matching roots and thus we consider the nodes not matching.
var hasAncestor1 = Comparer.TryGetAncestor(node1, tiedToAncestor, out var ancestor1);
var hasAncestor2 = Comparer.TryGetAncestor(node2, tiedToAncestor, out var ancestor2);
if (hasAncestor1 != hasAncestor2)
{
continue;
}
if (hasAncestor1)
{
// Since CategorizeNodesByLabels added nodes to the s1/s2 lists in depth-first prefix order,
// we can also accept equality in the following condition. That's because we find the partner
// of the parent node before we get to finding it for the child node of the same kind.
Debug.Assert(Comparer.GetLabel(ancestor1) <= Comparer.GetLabel(node1));
if (!Contains(ancestor1, ancestor2))
{
continue;
}
}
}
// We know that
// 1. (node1, node2) not in M
// 2. Both of their parents are matched to the same parent (or are not matched)
//
// Now, we have no other choice than comparing the node "values"
// and looking for the one with the smaller distance.
var distance = Comparer.GetDistance(node1, node2);
if (distance < bestDistance)
{
matched = true;
bestMatch = node2;
bestDistance = distance;
// We only stop if we've got an exact match. This is to resolve the problem
// of entities with identical names(name is often used as the "value" of a
// node) but with different "sub-values" (e.g. two locals may have the same name
// but different types. Since the type is not part of the value, we don't want
// to stop looking for the best match if we don't have an exact match).
if (distance == ExactMatchDistance)
{
break;
}
}
}
if (matched && bestDistance <= maxAcceptableDistance)
{
var added = TryAdd(node1, bestMatch);
// We checked above that node1 doesn't have a partner.
// The map is a bijection by construction, so we should be able to add the mapping.
Debug.Assert(added);
// If we exactly matched to firstNonMatch2 we can advance it.
if (i2 == firstNonMatch2)
{
firstNonMatch2 = i2 + 1;
}
if (firstNonMatch2 == count2)
{
return;
}
}
}
}
internal bool TryAdd(TNode node1, TNode node2)
{
Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
if (_oneToTwo.ContainsKey(node1) || _twoToOne.ContainsKey(node2))
{
return false;
}
_oneToTwo.Add(node1, node2);
_twoToOne.Add(node2, node1);
return true;
}
internal bool TryGetPartnerInTree1(TNode node2, out TNode partner1)
{
var result = _twoToOne.TryGetValue(node2, out partner1);
Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
Debug.Assert(!result || Comparer.TreesEqual(partner1, OldRoot));
return result;
}
internal bool HasPartnerInTree1(TNode node2)
{
Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
return _twoToOne.ContainsKey(node2);
}
internal bool TryGetPartnerInTree2(TNode node1, out TNode partner2)
{
var result = _oneToTwo.TryGetValue(node1, out partner2);
Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
Debug.Assert(!result || Comparer.TreesEqual(partner2, NewRoot));
return result;
}
internal bool HasPartnerInTree2(TNode node1)
{
Debug.Assert(Comparer.TreesEqual(node1, OldRoot));
return _oneToTwo.ContainsKey(node1);
}
internal bool Contains(TNode node1, TNode node2)
{
Debug.Assert(Comparer.TreesEqual(node2, NewRoot));
return TryGetPartnerInTree2(node1, out var partner2) && node2.Equals(partner2);
}
public TreeComparer<TNode> Comparer { get; }
public TNode OldRoot { get; }
public TNode NewRoot { get; }
public IReadOnlyDictionary<TNode, TNode> Matches
{
get
{
return new ReadOnlyDictionary<TNode, TNode>(_oneToTwo);
}
}
public IReadOnlyDictionary<TNode, TNode> ReverseMatches
{
get
{
return new ReadOnlyDictionary<TNode, TNode>(_twoToOne);
}
}
public bool TryGetNewNode(TNode oldNode, out TNode newNode)
=> _oneToTwo.TryGetValue(oldNode, out newNode);
public bool TryGetOldNode(TNode newNode, out TNode oldNode)
=> _twoToOne.TryGetValue(newNode, out oldNode);
/// <summary>
/// Returns an edit script (a sequence of edits) that transform <see cref="OldRoot"/> subtree
/// to <see cref="NewRoot"/> subtree.
/// </summary>
public EditScript<TNode> GetTreeEdits()
=> new(this);
/// <summary>
/// Returns an edit script (a sequence of edits) that transform a sequence of nodes <paramref name="oldNodes"/>
/// to a sequence of nodes <paramref name="newNodes"/>.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="oldNodes"/> or <paramref name="newNodes"/> is a null reference.</exception>
public IEnumerable<Edit<TNode>> GetSequenceEdits(IEnumerable<TNode> oldNodes, IEnumerable<TNode> newNodes)
{
if (oldNodes == null)
{
throw new ArgumentNullException(nameof(oldNodes));
}
if (newNodes == null)
{
throw new ArgumentNullException(nameof(newNodes));
}
var oldList = (oldNodes as IReadOnlyList<TNode>) ?? oldNodes.ToList();
var newList = (newNodes as IReadOnlyList<TNode>) ?? newNodes.ToList();
return new LongestCommonSubsequence(this).GetEdits(oldList, newList);
}
}
|