|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using Microsoft.CodeAnalysis.Shared;
using Microsoft.CodeAnalysis.Shared.Collections;
using Microsoft.CodeAnalysis.Shared.Utilities;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.PatternMatching;
/// <summary>
/// The pattern matcher is not thread-safe. Do not use the pattern matcher across mutiple threads concurrently. It
/// also keeps an internal cache of data for speeding up operations. As such, it should be disposed when done to
/// release the cached data back. and release the matcher appropriately once you no longer need it. Also, while the
/// pattern matcher is culture aware, it uses the culture specified in the constructor.
/// </summary>
internal abstract partial class PatternMatcher : IDisposable
{
private static readonly char[] s_dotCharacterArray = ['.'];
public const int NoBonus = 0;
public const int CamelCaseContiguousBonus = 1;
public const int CamelCaseMatchesFromStartBonus = 2;
public const int CamelCaseMaxWeight = CamelCaseContiguousBonus + CamelCaseMatchesFromStartBonus;
private readonly bool _includeMatchedSpans;
private readonly bool _allowFuzzyMatching;
// PERF: Cache the culture's compareInfo to avoid the overhead of asking for them repeatedly in inner loops
private readonly CompareInfo _compareInfo;
private readonly TextInfo _textInfo;
private bool _invalidPattern;
/// <summary>
/// Construct a new PatternMatcher using the specified culture.
/// </summary>
/// <param name="culture">The culture to use for string searching and comparison.</param>
/// <param name="includeMatchedSpans">Whether or not the matching parts of the candidate should be supplied in results.</param>
/// <param name="allowFuzzyMatching">Whether or not close matches should count as matches.</param>
protected PatternMatcher(
bool includeMatchedSpans,
CultureInfo? culture,
bool allowFuzzyMatching = false)
{
culture ??= CultureInfo.CurrentCulture;
_compareInfo = culture.CompareInfo;
_textInfo = culture.TextInfo;
_includeMatchedSpans = includeMatchedSpans;
_allowFuzzyMatching = allowFuzzyMatching;
}
public virtual void Dispose()
{
}
public static PatternMatcher CreatePatternMatcher(
string pattern,
CultureInfo? culture = null,
bool includeMatchedSpans = false,
bool allowFuzzyMatching = false)
{
return new SimplePatternMatcher(pattern, culture, includeMatchedSpans, allowFuzzyMatching);
}
public static PatternMatcher CreateContainerPatternMatcher(
string[] patternParts,
char[] containerSplitCharacters,
bool includeMatchedSpans = false,
CultureInfo? culture = null,
bool allowFuzzyMatching = false)
{
return new ContainerPatternMatcher(
patternParts, containerSplitCharacters, includeMatchedSpans, culture, allowFuzzyMatching);
}
public static PatternMatcher CreateDotSeparatedContainerMatcher(
string pattern,
bool includeMatchedSpans = false,
CultureInfo? culture = null,
bool allowFuzzyMatching = false)
{
return CreateContainerPatternMatcher(
pattern.Split(s_dotCharacterArray, StringSplitOptions.RemoveEmptyEntries),
s_dotCharacterArray, includeMatchedSpans, culture, allowFuzzyMatching);
}
internal static (string name, string? containerOpt) GetNameAndContainer(string pattern)
{
var dotIndex = pattern.LastIndexOf('.');
var containsDots = dotIndex >= 0;
return containsDots
? (name: pattern[(dotIndex + 1)..], containerOpt: pattern[..dotIndex])
: (name: pattern, containerOpt: null);
}
public abstract bool AddMatches(string? candidate, ref TemporaryArray<PatternMatch> matches);
private bool SkipMatch([NotNullWhen(false)] string? candidate)
=> _invalidPattern || string.IsNullOrWhiteSpace(candidate);
private static bool ContainsUpperCaseLetter(string pattern)
{
// Expansion of "foreach(char ch in pattern)" to avoid a CharEnumerator allocation
for (var i = 0; i < pattern.Length; i++)
{
if (char.IsUpper(pattern[i]))
return true;
}
return false;
}
private static bool ContainsLowerCaseLetter(string pattern)
{
// Expansion of "foreach(char ch in pattern)" to avoid a CharEnumerator allocation
for (var i = 0; i < pattern.Length; i++)
{
if (char.IsLower(pattern[i]))
return true;
}
return false;
}
private PatternMatch? MatchPatternChunk(
string candidate,
ref TextChunk patternChunk,
bool punctuationStripped,
bool fuzzyMatch)
{
return fuzzyMatch
? FuzzyMatchPatternChunk(candidate, ref patternChunk, punctuationStripped)
: NonFuzzyMatchPatternChunk(candidate, patternChunk, punctuationStripped);
}
private static PatternMatch? FuzzyMatchPatternChunk(
string candidate,
ref TextChunk patternChunk,
bool punctuationStripped)
{
Contract.ThrowIfTrue(patternChunk.SimilarityChecker.IsDefault);
if (patternChunk.SimilarityChecker.AreSimilar(candidate))
{
return new PatternMatch(
PatternMatchKind.Fuzzy, punctuationStripped, isCaseSensitive: false, matchedSpan: null);
}
return null;
}
private PatternMatch? NonFuzzyMatchPatternChunk(
string candidate,
in TextChunk patternChunk,
bool punctuationStripped)
{
using var candidateHumps = TemporaryArray<TextSpan>.Empty;
var candidateLength = candidate.Length;
var caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, CompareOptions.IgnoreCase);
if (caseInsensitiveIndex == 0)
{
// We found the pattern at the start of the candidate. This is either an exact or
// prefix match.
if (patternChunk.Text.Length == candidateLength)
{
// Lengths were the same, this is either a case insensitive or sensitive exact match.
return new PatternMatch(
PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternChunk.Text,
matchedSpan: GetMatchedSpan(0, candidateLength));
}
else
{
var isCaseSensitive = _compareInfo.IsPrefix(candidate, patternChunk.Text);
if (!isCaseSensitive && patternChunk.IsUppercase)
{
// The user wrote something all upper case, but happened to match the prefix of the word. For
// example, matching `CR` against both `Create` (a case insensitive prefix match) and `CreateRange`
// (a camel hump match). We want to prefer the latter in this case as the all upper case string
// is a strong signal they wanted camel hump matching here.
PopulateCandidateHumps();
// Note: ensure that we match here case sensitively as well. We only want to take this if their
// camel humps actually matched real word starts.
var match = TryCamelCaseMatch(candidate, patternChunk, punctuationStripped, isLowercase: false, candidateHumps);
if (match is { IsCaseSensitive: true })
return match;
// Deliberately fall through.
}
// Lengths were the same, this is either a case insensitive or sensitive prefix match.
return new PatternMatch(
PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive, matchedSpan: GetMatchedSpan(0, patternChunk.Text.Length));
}
}
var patternIsLowercase = patternChunk.IsLowercase;
if (caseInsensitiveIndex > 0)
{
// We found the pattern somewhere in the candidate. This could be a substring match.
// However, we don't want to be overaggressive in returning just any substring results.
// So do a few more checks to make sure this is a good result.
if (!patternIsLowercase)
{
// Pattern contained uppercase letters. This is a strong indication from the
// user that they expect the same letters to be uppercase in the result. As
// such, only return this if we can find this pattern exactly in the candidate.
var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, startIndex: caseInsensitiveIndex, CompareOptions.None);
if (caseSensitiveIndex > 0)
{
var resultType = char.IsUpper(candidate[caseSensitiveIndex]) ? PatternMatchKind.StartOfWordSubstring : PatternMatchKind.NonLowercaseSubstring;
return new PatternMatch(
resultType,
punctuationStripped,
isCaseSensitive: true,
matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternChunk.Text.Length));
}
}
else
{
// Pattern was all lowercase. This can lead to lots of hits. For example, "bin" in
// "CombineUnits". Instead, we want it to match "Operator[|Bin|]ary" first rather than
// Com[|bin|]eUnits
// If the lowercase search string matched what looks to be the start of a word then that's a
// reasonable hit. This is equivalent to 'bin' matching 'Operator[|Bin|]ary'
if (char.IsUpper(candidate[caseInsensitiveIndex]))
{
return new PatternMatch(PatternMatchKind.StartOfWordSubstring, punctuationStripped,
isCaseSensitive: false,
matchedSpan: GetMatchedSpan(caseInsensitiveIndex, patternChunk.Text.Length));
}
// Now do the more expensive check to see if we're at the start of a word. This is to catch
// word matches like CombineBinary. We want to find the hit against '[|Bin|]ary' not
// 'Com[|bin|]e'
PopulateCandidateHumps();
for (int i = 0, n = candidateHumps.Count; i < n; i++)
{
var hump = TextSpan.FromBounds(candidateHumps[i].Start, candidateLength);
if (PartStartsWith(candidate, hump, patternChunk.Text, CompareOptions.IgnoreCase))
{
return new PatternMatch(PatternMatchKind.StartOfWordSubstring, punctuationStripped,
isCaseSensitive: PartStartsWith(candidate, hump, patternChunk.Text, CompareOptions.None),
matchedSpan: GetMatchedSpan(hump.Start, patternChunk.Text.Length));
}
}
}
}
{
// Didn't have an exact/prefix match, or a high enough quality substring match.
// See if we can find a camel case match.
PopulateCandidateHumps();
// Didn't have an exact/prefix match, or a high enough quality substring match.
// See if we can find a camel case match.
var match = TryCamelCaseMatch(candidate, patternChunk, punctuationStripped, patternIsLowercase, candidateHumps);
if (match != null)
return match;
}
// If pattern was all lowercase, we allow it to match an all lowercase section of the candidate. But
// only after we've tried all other forms first. This is the weakest of all matches. For example, if
// user types 'bin' we want to match 'OperatorBinary' (start of word) or 'BinaryInformationNode' (camel
// humps) before matching 'Combine'.
//
// We only do this for strings longer than three characters to avoid too many false positives when the
// user has only barely started writing a word.
if (patternIsLowercase && caseInsensitiveIndex > 0 && patternChunk.Text.Length >= 3)
{
var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, startIndex: caseInsensitiveIndex, CompareOptions.None);
if (caseSensitiveIndex > 0)
{
return new PatternMatch(
PatternMatchKind.LowercaseSubstring, punctuationStripped, isCaseSensitive: true,
matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternChunk.Text.Length));
}
}
return null;
void PopulateCandidateHumps()
{
if (candidateHumps.Count == 0)
StringBreaker.AddWordParts(candidate, ref candidateHumps.AsRef());
}
}
private TextSpan? GetMatchedSpan(int start, int length)
=> _includeMatchedSpans ? new TextSpan(start, length) : null;
private static bool ContainsSpaceOrAsterisk(string text)
{
for (var i = 0; i < text.Length; i++)
{
var ch = text[i];
if (ch is ' ' or '*')
{
return true;
}
}
return false;
}
/// <summary>
/// Internal helper for MatchPatternInternal
/// </summary>
/// <remarks>
/// PERF: Designed to minimize allocations in common cases.
/// If there's no match, then null is returned.
/// If there's a single match, or the caller only wants the first match, then it is returned (as a Nullable)
/// If there are multiple matches, and the caller wants them all, then a List is allocated.
/// </remarks>
/// <param name="candidate">The word being tested.</param>
/// <param name="segment">The segment of the pattern to check against the candidate.</param>
/// <param name="matches">The result array to place the matches in.</param>
/// <param name="fuzzyMatch">If a fuzzy match should be performed</param>
/// <returns>If there's only one match, then the return value is that match. Otherwise it is null.</returns>
private bool MatchPatternSegment(
string candidate,
ref PatternSegment segment,
ref TemporaryArray<PatternMatch> matches,
bool fuzzyMatch)
{
if (fuzzyMatch && !_allowFuzzyMatching)
{
return false;
}
// First check if the segment matches as is. This is also useful if the segment contains
// characters we would normally strip when splitting into parts that we also may want to
// match in the candidate. For example if the segment is "@int" and the candidate is
// "@int", then that will show up as an exact match here.
//
// Note: if the segment contains a space or an asterisk then we must assume that it's a
// multi-word segment.
if (!ContainsSpaceOrAsterisk(segment.TotalTextChunk.Text))
{
var match = MatchPatternChunk(
candidate, ref segment.TotalTextChunk, punctuationStripped: false, fuzzyMatch: fuzzyMatch);
if (match != null)
{
matches.Add(match.Value);
return true;
}
}
// The logic for pattern matching is now as follows:
//
// 1) Break the segment passed in into words. Breaking is rather simple and a
// good way to think about it that if gives you all the individual alphanumeric words
// of the pattern.
//
// 2) For each word try to match the word against the candidate value.
//
// 3) Matching logic is outlined in NonFuzzyMatchPatternChunk. It's not repeated here to
// prevent having multiple places to keep up to date.
//
// Only if all words have some sort of match is the pattern considered matched.
// Special case a simple pattern (alpha-numeric with no spaces). This is the common
// case and we want to prevent unnecessary overhead.
var subWordTextChunks = segment.SubWordTextChunks;
if (subWordTextChunks.Length == 1)
{
var result = MatchPatternChunk(
candidate, ref subWordTextChunks[0], punctuationStripped: true, fuzzyMatch: fuzzyMatch);
if (result == null)
{
return false;
}
matches.Add(result.Value);
return true;
}
else
{
using var tempMatches = TemporaryArray<PatternMatch>.Empty;
for (int i = 0, n = subWordTextChunks.Length; i < n; i++)
{
// Try to match the candidate with this word
var result = MatchPatternChunk(
candidate, ref subWordTextChunks[i], punctuationStripped: true, fuzzyMatch: fuzzyMatch);
if (result == null)
return false;
tempMatches.Add(result.Value);
}
matches.AddRange(tempMatches);
return tempMatches.Count > 0;
}
}
private static bool IsWordChar(char ch)
=> char.IsLetterOrDigit(ch) || ch == '_';
/// <summary>
/// Do the two 'parts' match? i.e. Does the candidate part start with the pattern part?
/// </summary>
/// <param name="candidate">The candidate text</param>
/// <param name="candidatePart">The span within the <paramref name="candidate"/> text</param>
/// <param name="pattern">The pattern text</param>
/// <param name="patternPart">The span within the <paramref name="pattern"/> text</param>
/// <param name="compareOptions">Options for doing the comparison (case sensitive or not)</param>
/// <returns>True if the span identified by <paramref name="candidatePart"/> within <paramref name="candidate"/> starts with
/// the span identified by <paramref name="patternPart"/> within <paramref name="pattern"/>.</returns>
private bool PartStartsWith(string candidate, TextSpan candidatePart, string pattern, TextSpan patternPart, CompareOptions compareOptions)
{
if (patternPart.Length > candidatePart.Length)
{
// Pattern part is longer than the candidate part. There can never be a match.
return false;
}
return _compareInfo.Compare(
candidate, candidatePart.Start, patternPart.Length,
pattern, patternPart.Start, patternPart.Length, compareOptions) == 0;
}
/// <summary>
/// Does the given part start with the given pattern?
/// </summary>
/// <param name="candidate">The candidate text</param>
/// <param name="candidatePart">The span within the <paramref name="candidate"/> text</param>
/// <param name="pattern">The pattern text</param>
/// <param name="compareOptions">Options for doing the comparison (case sensitive or not)</param>
/// <returns>True if the span identified by <paramref name="candidatePart"/> within <paramref name="candidate"/> starts with <paramref name="pattern"/></returns>
private bool PartStartsWith(string candidate, TextSpan candidatePart, string pattern, CompareOptions compareOptions)
=> PartStartsWith(candidate, candidatePart, pattern, new TextSpan(0, pattern.Length), compareOptions);
private PatternMatch? TryCamelCaseMatch(
string candidate,
in TextChunk patternChunk,
bool punctuationStripped,
bool isLowercase,
in TemporaryArray<TextSpan> candidateHumps)
{
if (isLowercase)
{
// e) If the word was entirely lowercase, then attempt a special lower cased camel cased
// match. i.e. cofipro would match CodeFixProvider.
var camelCaseKind = TryAllLowerCamelCaseMatch(
candidate, candidateHumps, patternChunk, out var matchedSpans);
if (camelCaseKind.HasValue)
{
return new PatternMatch(
camelCaseKind.Value, punctuationStripped, isCaseSensitive: false,
matchedSpans: matchedSpans);
}
}
else
{
// f) If the word was not entirely lowercase, then attempt a normal camel cased match.
// i.e. CoFiPro would match CodeFixProvider, but CofiPro would not.
if (patternChunk.PatternHumps.Count > 0)
{
// PERF: This can be called thousands of times per completion session with only a handful of matches found.
// Checking for case insensitive initially reduces the TryUpperCaseCamelCaseMatch call count to 1 for the
// non-matching candidates, but increases the call count to 2 for the much less frequent matching candidates.
var camelCaseKindIgnoreCase = TryUpperCaseCamelCaseMatch(candidate, candidateHumps, patternChunk, CompareOptions.IgnoreCase, out var matchedSpansIgnoreCase);
if (camelCaseKindIgnoreCase.HasValue)
{
var camelCaseKind = TryUpperCaseCamelCaseMatch(candidate, candidateHumps, patternChunk, CompareOptions.None, out var matchedSpans);
if (camelCaseKind.HasValue)
{
return new PatternMatch(
camelCaseKind.Value, punctuationStripped, isCaseSensitive: true,
matchedSpans: matchedSpans);
}
return new PatternMatch(
camelCaseKindIgnoreCase.Value, punctuationStripped, isCaseSensitive: false,
matchedSpans: matchedSpansIgnoreCase);
}
}
}
return null;
}
private PatternMatchKind? TryAllLowerCamelCaseMatch(
string candidate,
in TemporaryArray<TextSpan> candidateHumps,
in TextChunk patternChunk,
out ImmutableArray<TextSpan> matchedSpans)
{
var matcher = new AllLowerCamelCaseMatcher(_includeMatchedSpans, candidate, patternChunk.Text, _textInfo);
return matcher.TryMatch(candidateHumps, out matchedSpans);
}
private PatternMatchKind? TryUpperCaseCamelCaseMatch(
string candidate,
in TemporaryArray<TextSpan> candidateHumps,
in TextChunk patternChunk,
CompareOptions compareOption,
out ImmutableArray<TextSpan> matchedSpans)
{
ref readonly var patternHumps = ref patternChunk.PatternHumps;
// Note: we may have more pattern parts than candidate parts. This is because multiple
// pattern parts may match a candidate part. For example "SiUI" against "SimpleUI".
// We'll have 3 pattern parts Si/U/I against two candidate parts Simple/UI. However, U
// and I will both match in UI.
var currentCandidateHump = 0;
var currentPatternHump = 0;
int? firstMatch = null;
bool? contiguous = null;
var patternHumpCount = patternHumps.Count;
var candidateHumpCount = candidateHumps.Count;
using var matchSpans = TemporaryArray<TextSpan>.Empty;
while (true)
{
// Let's consider our termination cases
if (currentPatternHump == patternHumpCount)
{
Debug.Assert(firstMatch.HasValue);
Debug.Assert(contiguous.HasValue);
var matchCount = matchSpans.Count;
matchedSpans = _includeMatchedSpans
? new NormalizedTextSpanCollection(matchSpans.ToImmutableAndClear()).ToImmutableArray()
: [];
var camelCaseResult = new CamelCaseResult(firstMatch == 0, contiguous.Value, matchCount, null);
return GetCamelCaseKind(camelCaseResult, candidateHumps);
}
else if (currentCandidateHump == candidateHumpCount)
{
// No match, since we still have more of the pattern to hit
matchedSpans = [];
return null;
}
var candidateHump = candidateHumps[currentCandidateHump];
var gotOneMatchThisCandidate = false;
// Consider the case of matching SiUI against SimpleUIElement. The candidate parts
// will be Simple/UI/Element, and the pattern parts will be Si/U/I. We'll match 'Si'
// against 'Simple' first. Then we'll match 'U' against 'UI'. However, we want to
// still keep matching pattern parts against that candidate part.
for (; currentPatternHump < patternHumpCount; currentPatternHump++)
{
var patternChunkCharacterSpan = patternHumps[currentPatternHump];
if (gotOneMatchThisCandidate)
{
// We've already gotten one pattern part match in this candidate. We will
// only continue trying to consume pattern parts if the last part and this
// part are both upper case.
if (!char.IsUpper(patternChunk.Text[patternHumps[currentPatternHump - 1].Start]) ||
!char.IsUpper(patternChunk.Text[patternHumps[currentPatternHump].Start]))
{
break;
}
}
if (!PartStartsWith(candidate, candidateHump, patternChunk.Text, patternChunkCharacterSpan, compareOption))
{
break;
}
matchSpans.Add(new TextSpan(candidateHump.Start, patternChunkCharacterSpan.Length));
gotOneMatchThisCandidate = true;
firstMatch ??= currentCandidateHump;
// If we were contiguous, then keep that value. If we weren't, then keep that
// value. If we don't know, then set the value to 'true' as an initial match is
// obviously contiguous.
contiguous ??= true;
candidateHump = new TextSpan(candidateHump.Start + patternChunkCharacterSpan.Length, candidateHump.Length - patternChunkCharacterSpan.Length);
}
// Check if we matched anything at all. If we didn't, then we need to unset the
// contiguous bit if we currently had it set.
// If we haven't set the bit yet, then that means we haven't matched anything so
// far, and we don't want to change that.
if (!gotOneMatchThisCandidate && contiguous.HasValue)
{
contiguous = false;
}
// Move onto the next candidate.
currentCandidateHump++;
}
}
}
|