// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; using System.Diagnostics; namespace Roslyn.Utilities; internal struct WordSimilarityChecker : IDisposable { private readonly struct CacheResult(string candidate, bool areSimilar, double similarityWeight) { public readonly string CandidateText = candidate; public readonly bool AreSimilar = areSimilar; public readonly double SimilarityWeight = similarityWeight; } // Cache the result of the last call to AreSimilar. We'll often be called with the same // value multiple times in a row, so we can avoid expensive computation by returning the // same value immediately. private CacheResult _lastAreSimilarResult; private readonly string _source; private readonly EditDistance _editDistance; private readonly int _threshold; /// <summary> /// Whether or words should be considered similar if one is contained within the other /// (regardless of edit distance). For example if is true then IService would be considered /// similar to IServiceFactory despite the edit distance being quite high at 7. /// </summary> private readonly bool _substringsAreSimilar; public readonly bool IsDefault => _source is null; public WordSimilarityChecker(string text, bool substringsAreSimilar) { _source = text ?? throw new ArgumentNullException(nameof(text)); _threshold = GetThreshold(_source); _editDistance = new EditDistance(text); _substringsAreSimilar = substringsAreSimilar; } public readonly void Dispose() { if (this.IsDefault) return; _editDistance.Dispose(); } public static bool AreSimilar(string originalText, string candidateText) => AreSimilar(originalText, candidateText, substringsAreSimilar: false); public static bool AreSimilar(string originalText, string candidateText, bool substringsAreSimilar) => AreSimilar(originalText, candidateText, substringsAreSimilar, out _); public static bool AreSimilar(string originalText, string candidateText, out double similarityWeight) { return AreSimilar( originalText, candidateText, substringsAreSimilar: false, similarityWeight: out similarityWeight); } /// <summary> /// Returns true if 'originalText' and 'candidateText' are likely a misspelling of each other. /// Returns false otherwise. If it is a likely misspelling a similarityWeight is provided /// to help rank the match. Lower costs mean it was a better match. /// </summary> public static bool AreSimilar(string originalText, string candidateText, bool substringsAreSimilar, out double similarityWeight) { using var checker = new WordSimilarityChecker(originalText, substringsAreSimilar); var result = checker.AreSimilar(candidateText, out similarityWeight); return result; } internal static int GetThreshold(string value) => value.Length <= 4 ? 1 : 2; public bool AreSimilar(string candidateText) => AreSimilar(candidateText, out _); public bool AreSimilar(string candidateText, out double similarityWeight) { if (_source.Length < 3) { // If we're comparing strings that are too short, we'll find // far too many spurious hits. Don't even bother in this case. similarityWeight = double.MaxValue; return false; } if (_lastAreSimilarResult.CandidateText == candidateText) { similarityWeight = _lastAreSimilarResult.SimilarityWeight; return _lastAreSimilarResult.AreSimilar; } var result = AreSimilarWorker(candidateText, out similarityWeight); _lastAreSimilarResult = new CacheResult(candidateText, result, similarityWeight); return result; } private bool AreSimilarWorker(string candidateText, out double similarityWeight) { similarityWeight = double.MaxValue; // If the two strings differ by more characters than the cost threshold, then there's // no point in even computing the edit distance as it would necessarily take at least // that many additions/deletions. if (Math.Abs(_source.Length - candidateText.Length) <= _threshold) { similarityWeight = _editDistance.GetEditDistance(candidateText, _threshold); } if (similarityWeight > _threshold) { // it had a high cost. However, the string the user typed was contained // in the string we're currently looking at. That's enough to consider it // although we place it just at the threshold (i.e. it's worse than all // other matches). if (_substringsAreSimilar && candidateText.IndexOf(_source, StringComparison.OrdinalIgnoreCase) >= 0) { similarityWeight = _threshold; } else { return false; } } Debug.Assert(similarityWeight <= _threshold); similarityWeight += Penalty(candidateText, _source); return true; } private static double Penalty(string candidateText, string originalText) { var lengthDifference = Math.Abs(originalText.Length - candidateText.Length); if (lengthDifference != 0) { // For all items of the same edit cost, we penalize those that are // much longer than the original text versus those that are only // a little longer. // // Note: even with this penalty, all matches of cost 'X' will all still // cost less than matches of cost 'X + 1'. i.e. the penalty is in the // range [0, 1) and only serves to order matches of the same cost. // // Here's the relation of the first few values of length diff and penalty: // LengthDiff -> Penalty // 1 -> .5 // 2 -> .66 // 3 -> .75 // 4 -> .8 // And so on and so forth. var penalty = 1.0 - (1.0 / (lengthDifference + 1)); return penalty; } return 0; } public readonly bool LastCacheResultIs(bool areSimilar, string candidateText) => _lastAreSimilarResult.AreSimilar == areSimilar && _lastAreSimilarResult.CandidateText == candidateText; } |