|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Text.RegularExpressions;
using Microsoft.CodeAnalysis.EmbeddedLanguages.Common;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions;
using static EmbeddedSyntaxHelpers;
using static RegexHelpers;
using RegexAlternatingSequenceList = EmbeddedSeparatedSyntaxNodeList<RegexKind, RegexNode, RegexSequenceNode>;
using RegexNodeOrToken = EmbeddedSyntaxNodeOrToken<RegexKind, RegexNode>;
using RegexToken = EmbeddedSyntaxToken<RegexKind>;
/// <summary>
/// Produces a <see cref="RegexTree"/> from a sequence of <see cref="VirtualChar"/> characters.
///
/// Importantly, this parser attempts to replicate diagnostics with almost the exact same text
/// as the native .NET regex parser. This is important so that users get an understandable
/// experience where it appears to them that this is all one cohesive system and that the IDE
/// will let them discover and fix the same issues they would encounter when previously trying
/// to just compile and execute these regexes.
/// </summary>
/// <remarks>
/// Invariants we try to maintain (and should consider a bug if we do not): l 1. If the .NET
/// regex parser does not report an error for a given pattern, we should not either. it would be
/// very bad if we told the user there was something wrong with there pattern when there really
/// wasn't.
///
/// 2. If the .NET regex parser does report an error for a given pattern, we should either not
/// report an error (not recommended) or report the same error at an appropriate location in the
/// pattern. Not reporting the error can be confusing as the user will think their pattern is
/// ok, when it really is not. However, it can be acceptable to do this as it's not telling
/// them that something is actually wrong, and it may be too difficult to find and report the
/// same error. Note: there is only one time we do this in this parser (see the deviation
/// documented in <see cref="ParsePossibleEcmascriptBackreferenceEscape"/>).
///
/// Note1: "report the same error" means that we will attempt to report the error using the same
/// text the .NET regex parser uses for its error messages. This is so that the user is not
/// confused when they use the IDE vs running the regex by getting different messages for the
/// same issue.
///
/// Note2: the above invariants make life difficult at times. This happens due to the fact that
/// the .NET parser is multi-pass. Meaning it does a first scan (which may report errors), then
/// does the full parse. This means that it might report an error in a later location during
/// the initial scan than it would during the parse. We replicate that behavior to follow the
/// second invariant.
///
/// Note3: It would be nice if we could check these invariants at runtime, so we could control
/// our behavior by the behavior of the real .NET regex engine. For example, if the .NET regex
/// engine did not report any issues, we could suppress any diagnostics we generated and we
/// could log an NFW to record which pattern we deviated on so we could fix the issue for a
/// future release. However, we cannot do this as the .NET regex engine has no guarantees about
/// its performance characteristics. For example, certain regex patterns might end up causing
/// that engine to consume unbounded amounts of CPU and memory. This is because the .NET regex
/// engine is not just a parser, but something that builds an actual recognizer using techniques
/// that are not necessarily bounded. As such, while we test ourselves around it during our
/// tests, we cannot do the same at runtime as part of the IDE.
///
/// This parser was based off the corefx RegexParser based at:
/// https://github.com/dotnet/corefx/blob/f759243d724f462da0bcef54e86588f8a55352c6/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1
///
/// Note4: The .NET parser itself changes over time (for example to fix behavior that even it
/// thinks is buggy). When this happens, we have to make a choice as to which behavior to
/// follow. In general, the overall principle is that we should follow the more lenient
/// behavior. If we end up taking the more strict interpretation we risk giving people an error
/// during design time that they would not get at runtime. It's far worse to have that than to
/// not report an error, even though one might happen later.
/// </remarks>
internal partial struct RegexParser
{
private readonly ImmutableDictionary<string, TextSpan> _captureNamesToSpan;
private readonly ImmutableDictionary<int, TextSpan> _captureNumbersToSpan;
private RegexLexer _lexer;
private RegexOptions _options;
private RegexToken _currentToken;
private int _recursionDepth;
private RegexParser(
VirtualCharSequence text, RegexOptions options,
ImmutableDictionary<string, TextSpan> captureNamesToSpan,
ImmutableDictionary<int, TextSpan> captureNumbersToSpan) : this()
{
_lexer = new RegexLexer(text);
_options = options;
_captureNamesToSpan = captureNamesToSpan;
_captureNumbersToSpan = captureNumbersToSpan;
// Get the first token. It is allowed to have trivia on it.
ConsumeCurrentToken(allowTrivia: true);
}
/// <summary>
/// Returns the latest token the lexer has produced, and then asks the lexer to
/// produce the next token after that.
/// </summary>
/// <param name="allowTrivia">Whether or not trivia is allowed on the next token
/// produced. In the .NET parser trivia is only allowed on a few constructs,
/// and our parser mimics that behavior. Note that even if trivia is allowed,
/// the type of trivia that can be scanned depends on the current RegexOptions.
/// For example, if <see cref="RegexOptions.IgnorePatternWhitespace"/> is currently
/// enabled, then '#...' comments are allowed. Otherwise, only '(?#...)' comments
/// are allowed.</param>
private RegexToken ConsumeCurrentToken(bool allowTrivia)
{
var previous = _currentToken;
_currentToken = _lexer.ScanNextToken(allowTrivia, _options);
return previous;
}
/// <summary>
/// Given an input text, and set of options, parses out a fully representative syntax tree
/// and list of diagnostics. Parsing should always succeed, except in the case of the stack
/// overflowing.
/// </summary>
public static RegexTree? TryParse(VirtualCharSequence text, RegexOptions options)
{
if (text.IsDefault)
{
return null;
}
try
{
// Parse the tree once, to figure out the capture groups. These are needed
// to then parse the tree again, as the captures will affect how we interpret
// certain things (i.e. escape references) and what errors will be reported.
//
// This is necessary as .NET regexes allow references to *future* captures.
// As such, we don't know when we're seeing a reference if it's to something
// that exists or not.
var tree1 = new RegexParser(text, options,
ImmutableDictionary<string, TextSpan>.Empty,
ImmutableDictionary<int, TextSpan>.Empty).ParseTree();
var (captureNames, captureNumbers) = CaptureInfoAnalyzer.Analyze(text, tree1.Root, options);
var tree2 = new RegexParser(
text, options, captureNames, captureNumbers).ParseTree();
return tree2;
}
catch (InsufficientExecutionStackException)
{
return null;
}
}
private RegexTree ParseTree()
{
// Most callers to ParseAlternatingSequences are from group constructs. As those
// constructs will have already consumed the open paren, they don't want this sub-call
// to consume through close-paren tokens as they want that token for themselves.
// However, we're the topmost call and have not consumed an open paren. And, we want
// this call to consume all the way to the end, eating up excess close-paren tokens that
// are encountered.
var expression = this.ParseAlternatingSequences(consumeCloseParen: true, isConditional: false);
Debug.Assert(_lexer.Position == _lexer.Text.Length);
Debug.Assert(_currentToken.Kind == RegexKind.EndOfFile);
var root = new RegexCompilationUnit(expression, _currentToken);
using var _1 = PooledHashSet<EmbeddedDiagnostic>.GetInstance(out var seenDiagnostics);
using var _2 = ArrayBuilder<EmbeddedDiagnostic>.GetInstance(out var diagnostics);
CollectDiagnostics(root, seenDiagnostics, diagnostics);
return new RegexTree(
_lexer.Text, root, diagnostics.ToImmutable(),
_captureNamesToSpan, _captureNumbersToSpan);
}
private void CollectDiagnostics(
RegexNode node, HashSet<EmbeddedDiagnostic> seenDiagnostics, ArrayBuilder<EmbeddedDiagnostic> diagnostics)
{
try
{
_recursionDepth++;
StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
CollectDiagnosticsWorker(node, seenDiagnostics, diagnostics);
}
finally
{
_recursionDepth--;
}
}
private void CollectDiagnosticsWorker(RegexNode node, HashSet<EmbeddedDiagnostic> seenDiagnostics, ArrayBuilder<EmbeddedDiagnostic> diagnostics)
{
foreach (var child in node)
{
if (child.IsNode)
{
CollectDiagnostics(child.Node, seenDiagnostics, diagnostics);
}
else
{
var token = child.Token;
foreach (var trivia in token.LeadingTrivia)
{
AddUniqueDiagnostics(seenDiagnostics, trivia.Diagnostics, diagnostics);
}
// We never place trailing trivia on regex tokens.
Debug.Assert(token.TrailingTrivia.IsEmpty);
AddUniqueDiagnostics(seenDiagnostics, token.Diagnostics, diagnostics);
}
}
}
/// <summary>
/// It's very common to have duplicated diagnostics. For example, consider "((". This will
/// have two 'missing )' diagnostics, both at the end. Reporting both isn't helpful, so we
/// filter duplicates out here.
/// </summary>
private static void AddUniqueDiagnostics(
HashSet<EmbeddedDiagnostic> seenDiagnostics, ImmutableArray<EmbeddedDiagnostic> from, ArrayBuilder<EmbeddedDiagnostic> to)
{
foreach (var diagnostic in from)
{
if (seenDiagnostics.Add(diagnostic))
{
to.Add(diagnostic);
}
}
}
private RegexAlternationNode ParseAlternatingSequences(
bool consumeCloseParen, bool isConditional)
{
try
{
_recursionDepth++;
StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
return ParseAlternatingSequencesWorker(consumeCloseParen, isConditional);
}
finally
{
_recursionDepth--;
}
}
/// <summary>
/// Parses out code of the form: ...|...|... This is the type of code you have at the top level of a regex, or
/// inside any grouping construct. Note that sequences can be empty in .NET regex. i.e. the following is
/// legal:
///
/// ...||...
///
/// An empty sequence just means "match at every position in the test string".
/// </summary>
private RegexAlternationNode ParseAlternatingSequencesWorker(
bool consumeCloseParen, bool isConditional)
{
using var _ = ArrayBuilder<RegexNodeOrToken>.GetInstance(out var builder);
builder.Add(ParseSequence(consumeCloseParen));
while (_currentToken.Kind == RegexKind.BarToken)
{
// Trivia allowed between the | and the next token.
var barToken = ConsumeCurrentToken(allowTrivia: true);
if (isConditional && builder.Count >= 3)
{
// a conditional alternative expression only allows two cases (the true and false branches). We
// already have seen both once we have 3 items (`t | f`). Error on any further cases we see.
barToken = barToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Too_many_bars_in_conditional_grouping,
barToken.GetSpan()));
}
builder.Add(barToken);
builder.Add(ParseSequence(consumeCloseParen));
}
return new RegexAlternationNode(new RegexAlternatingSequenceList(builder.ToImmutable()));
}
private RegexSequenceNode ParseSequence(bool consumeCloseParen)
{
using var _1 = ArrayBuilder<RegexExpressionNode>.GetInstance(out var builder);
while (ShouldConsumeSequenceElement(consumeCloseParen))
{
var last = builder.Count == 0 ? null : builder.Last();
builder.Add(ParsePrimaryExpressionAndQuantifiers(last));
}
// We will commonly get tons of text nodes in a row. For example, the regex `abc` will be three text nodes
// in a row. To help save on memory try to merge that into one single text node.
using var _2 = ArrayBuilder<RegexExpressionNode>.GetInstance(out var sequence);
MergeTextNodes(builder, sequence);
return new RegexSequenceNode(sequence.ToImmutable());
}
private static void MergeTextNodes(ArrayBuilder<RegexExpressionNode> list, ArrayBuilder<RegexExpressionNode> final)
{
// Iterate all the nodes in the sequence we have, adding them directly to
// `final` if they are not text nodes. If they are text nodes, we attempt
// to keep merging them with any following text nodes as long as well.
for (var index = 0; index < list.Count;)
{
var current = list[index];
if (current.Kind != RegexKind.Text)
{
// Not a text node. Just add as-is, and move to the next node.
index++;
final.Add(current);
continue;
}
// Got a text node. Try to combine it with all following nodes.
index = MergeAndAddAdjacentTextNodes(list, final, index);
}
return;
// local functions
static int MergeAndAddAdjacentTextNodes(
ArrayBuilder<RegexExpressionNode> list,
ArrayBuilder<RegexExpressionNode> final,
int index)
{
var startIndex = index;
var startTextNode = (RegexTextNode)list[startIndex];
// Keep walking forward as long as we hit text nodes and we can
// merge that text node with the previous text node.
index++;
var lastTextNode = startTextNode;
for (; index < list.Count; index++)
{
var currentNode = list[index];
if (!CanMerge(lastTextNode, currentNode))
{
// Hit something we couldn't merge with our last text node
// Break out and merge what we have so far. 'index' will
// be pointing at the right node for our caller.
break;
}
lastTextNode = (RegexTextNode)currentNode;
}
// If didn't have multiple text nodes in a row. Just return the
// starting node. Otherwise, create one text node that has a token
// that spans from the start of the first node to the end of the last node.
final.Add(startTextNode == lastTextNode
? startTextNode
: new RegexTextNode(CreateToken(
RegexKind.TextToken, startTextNode.TextToken.LeadingTrivia,
VirtualCharSequence.FromBounds(
startTextNode.TextToken.VirtualChars,
lastTextNode.TextToken.VirtualChars))));
return index;
}
// Local functions
static bool CanMerge(RegexTextNode lastNode, RegexExpressionNode next)
{
if (next.Kind == RegexKind.Text)
{
var lastTextToken = lastNode.TextToken;
var nextTextToken = ((RegexTextNode)next).TextToken;
// Can't merge if the next text node has leading trivia. Also, conservatively
// don't allow merging if there are diagnostics or values for these tokens.
// We might be able to support that, but it's easier to not do anything that
// might break an expectation someone might have downstream. /
if (lastTextToken.Diagnostics.Length == 0 &&
nextTextToken.Diagnostics.Length == 0 &&
lastTextToken.Value == null &&
nextTextToken.Value == null &&
nextTextToken.LeadingTrivia.Length == 0)
{
lastTextToken.VirtualChars.AssertAdjacentTo(nextTextToken.VirtualChars);
return true;
}
}
return false;
}
}
private readonly bool ShouldConsumeSequenceElement(bool consumeCloseParen)
{
if (_currentToken.Kind == RegexKind.EndOfFile)
{
return false;
}
if (_currentToken.Kind == RegexKind.BarToken)
{
return false;
}
if (_currentToken.Kind == RegexKind.CloseParenToken)
{
return consumeCloseParen;
}
return true;
}
private RegexExpressionNode ParsePrimaryExpressionAndQuantifiers(RegexExpressionNode? lastExpression)
{
var current = ParsePrimaryExpression(lastExpression);
if (current.Kind == RegexKind.SimpleOptionsGrouping)
{
// Simple options (i.e. "(?i-x)" can't have quantifiers attached to them).
return current;
}
return _currentToken.Kind switch
{
RegexKind.AsteriskToken => ParseZeroOrMoreQuantifier(current),
RegexKind.PlusToken => ParseOneOrMoreQuantifier(current),
RegexKind.QuestionToken => ParseZeroOrOneQuantifier(current),
RegexKind.OpenBraceToken => TryParseNumericQuantifier(current, _currentToken),
_ => current,
};
}
private RegexExpressionNode TryParseLazyQuantifier(RegexQuantifierNode quantifier)
{
if (_currentToken.Kind != RegexKind.QuestionToken)
{
return quantifier;
}
// Whitespace allowed after the question and the next sequence element.
return new RegexLazyQuantifierNode(quantifier,
ConsumeCurrentToken(allowTrivia: true));
}
private RegexExpressionNode ParseZeroOrMoreQuantifier(RegexPrimaryExpressionNode current)
{
// Whitespace allowed between the quantifier and the possible following ? or next sequence item.
return TryParseLazyQuantifier(new RegexZeroOrMoreQuantifierNode(current, ConsumeCurrentToken(allowTrivia: true)));
}
private RegexExpressionNode ParseOneOrMoreQuantifier(RegexPrimaryExpressionNode current)
{
// Whitespace allowed between the quantifier and the possible following ? or next sequence item.
return TryParseLazyQuantifier(new RegexOneOrMoreQuantifierNode(current, ConsumeCurrentToken(allowTrivia: true)));
}
private RegexExpressionNode ParseZeroOrOneQuantifier(RegexPrimaryExpressionNode current)
{
// Whitespace allowed between the quantifier and the possible following ? or next sequence item.
return TryParseLazyQuantifier(new RegexZeroOrOneQuantifierNode(current, ConsumeCurrentToken(allowTrivia: true)));
}
private RegexExpressionNode TryParseNumericQuantifier(
RegexPrimaryExpressionNode expression, RegexToken openBraceToken)
{
var start = _lexer.Position;
if (!TryParseNumericQuantifierParts(
out var firstNumberToken,
out var commaToken,
out var secondNumberToken,
out var closeBraceToken))
{
_currentToken = openBraceToken;
_lexer.Position = start;
return expression;
}
var quantifier = CreateQuantifier(
expression, openBraceToken, firstNumberToken, commaToken,
secondNumberToken, closeBraceToken);
return TryParseLazyQuantifier(quantifier);
}
private static RegexQuantifierNode CreateQuantifier(
RegexPrimaryExpressionNode expression,
RegexToken openBraceToken, RegexToken firstNumberToken, RegexToken? commaToken,
RegexToken? secondNumberToken, RegexToken closeBraceToken)
{
if (commaToken != null)
{
return secondNumberToken != null
? new RegexClosedNumericRangeQuantifierNode(expression, openBraceToken, firstNumberToken, commaToken.Value, secondNumberToken.Value, closeBraceToken)
: new RegexOpenNumericRangeQuantifierNode(expression, openBraceToken, firstNumberToken, commaToken.Value, closeBraceToken);
}
return new RegexExactNumericQuantifierNode(expression, openBraceToken, firstNumberToken, closeBraceToken);
}
private bool TryParseNumericQuantifierParts(
out RegexToken firstNumberToken, out RegexToken? commaToken,
out RegexToken? secondNumberToken, out RegexToken closeBraceToken)
{
firstNumberToken = default;
commaToken = null;
secondNumberToken = null;
closeBraceToken = default;
var firstNumber = _lexer.TryScanNumber();
if (firstNumber == null)
{
return false;
}
firstNumberToken = firstNumber.Value;
// Nothing allowed between {x,n}
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.CommaToken)
{
commaToken = _currentToken;
var start = _lexer.Position;
secondNumberToken = _lexer.TryScanNumber();
if (secondNumberToken == null)
{
// Nothing allowed between {x,n}
ResetToPositionAndConsumeCurrentToken(start, allowTrivia: false);
}
else
{
var secondNumberTokenLocal = secondNumberToken.Value;
// Nothing allowed between {x,n}
ConsumeCurrentToken(allowTrivia: false);
var val1 = (int)firstNumberToken.Value;
var val2 = (int)secondNumberTokenLocal.Value;
if (val2 < val1)
{
secondNumberTokenLocal = secondNumberTokenLocal.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Illegal_x_y_with_x_less_than_y,
secondNumberTokenLocal.GetSpan()));
secondNumberToken = secondNumberTokenLocal;
}
}
}
if (_currentToken.Kind != RegexKind.CloseBraceToken)
{
return false;
}
// Whitespace allowed between the quantifier and the possible following ? or next sequence item.
closeBraceToken = ConsumeCurrentToken(allowTrivia: true);
return true;
}
private void ResetToPositionAndConsumeCurrentToken(int position, bool allowTrivia)
{
_lexer.Position = position;
ConsumeCurrentToken(allowTrivia);
}
private RegexPrimaryExpressionNode ParsePrimaryExpression(RegexExpressionNode? lastExpression)
{
return _currentToken.Kind switch
{
RegexKind.DotToken => ParseWildcard(),
RegexKind.CaretToken => ParseStartAnchor(),
RegexKind.DollarToken => ParseEndAnchor(),
RegexKind.BackslashToken => ParseEscape(_currentToken, allowTriviaAfterEnd: true),
RegexKind.OpenBracketToken => ParseCharacterClass(),
RegexKind.OpenParenToken => ParseGrouping(),
RegexKind.CloseParenToken => ParseUnexpectedCloseParenToken(),
RegexKind.OpenBraceToken => ParsePossibleUnexpectedNumericQuantifier(lastExpression),
RegexKind.AsteriskToken or RegexKind.PlusToken or RegexKind.QuestionToken => ParseUnexpectedQuantifier(lastExpression),
_ => ParseText(),
};
}
private RegexPrimaryExpressionNode ParsePossibleUnexpectedNumericQuantifier(RegexExpressionNode? lastExpression)
{
// Native parser looks for something like {0,1} in a top level sequence and reports
// an explicit error that that's not allowed. However, something like {0, 1} is fine
// and is treated as six textual tokens.
var openBraceToken = _currentToken.With(kind: RegexKind.TextToken);
var start = _lexer.Position;
if (TryParseNumericQuantifierParts(
out _, out _, out _, out _))
{
// Report that a numeric quantifier isn't allowed here.
CheckQuantifierExpression(lastExpression, ref openBraceToken);
}
// Started with { but wasn't a numeric quantifier. This is totally legal and is just
// a textual sequence. Restart, scanning this token as a normal sequence element.
ResetToPositionAndConsumeCurrentToken(start, allowTrivia: true);
return new RegexTextNode(openBraceToken);
}
private RegexPrimaryExpressionNode ParseUnexpectedCloseParenToken()
{
var token = _currentToken.With(kind: RegexKind.TextToken).AddDiagnosticIfNone(
new EmbeddedDiagnostic(FeaturesResources.Too_many_close_parens, _currentToken.GetSpan()));
// Technically, since an error occurred, we can do whatever we want here. However,
// the spirit of the native parser is that top level sequence elements are allowed
// to have trivia. So that's the behavior we mimic.
ConsumeCurrentToken(allowTrivia: true);
return new RegexTextNode(token);
}
private RegexPrimaryExpressionNode ParseText()
{
var token = ConsumeCurrentToken(allowTrivia: true);
Debug.Assert(token.Value == null);
// Allow trivia between this piece of text and the next sequence element
return new RegexTextNode(token.With(kind: RegexKind.TextToken));
}
private RegexPrimaryExpressionNode ParseEndAnchor()
{
// Allow trivia between this anchor and the next sequence element
return new RegexAnchorNode(RegexKind.EndAnchor, ConsumeCurrentToken(allowTrivia: true));
}
private RegexPrimaryExpressionNode ParseStartAnchor()
{
// Allow trivia between this anchor and the next sequence element
return new RegexAnchorNode(RegexKind.StartAnchor, ConsumeCurrentToken(allowTrivia: true));
}
private RegexPrimaryExpressionNode ParseWildcard()
{
// Allow trivia between the . and the next sequence element
return new RegexWildcardNode(ConsumeCurrentToken(allowTrivia: true));
}
private RegexGroupingNode ParseGrouping()
{
var start = _lexer.Position;
// Check what immediately follows the (. If we have (? it is processed specially.
// However, we do not treat (? the same as ( ?
var openParenToken = ConsumeCurrentToken(allowTrivia: false);
switch (_currentToken.Kind)
{
case RegexKind.QuestionToken:
return ParseGroupQuestion(openParenToken, _currentToken);
default:
// Wasn't (? just parse this as a normal group.
_lexer.Position = start;
return ParseSimpleGroup(openParenToken);
}
}
private RegexToken ParseGroupingCloseParen()
{
switch (_currentToken.Kind)
{
case RegexKind.CloseParenToken:
// Grouping completed normally. Allow trivia between it and the next sequence element.
return ConsumeCurrentToken(allowTrivia: true);
default:
return CreateMissingToken(RegexKind.CloseParenToken).AddDiagnosticIfNone(
new EmbeddedDiagnostic(FeaturesResources.Not_enough_close_parens, GetTokenStartPositionSpan(_currentToken)));
}
}
private RegexSimpleGroupingNode ParseSimpleGroup(RegexToken openParenToken)
=> new(
openParenToken, ParseGroupingEmbeddedExpression(_options), ParseGroupingCloseParen());
private RegexExpressionNode ParseGroupingEmbeddedExpression(RegexOptions embeddedOptions)
{
// Save and restore options when we go into, and pop out of a group node.
var currentOptions = _options;
_options = embeddedOptions;
// We're parsing the embedded sequence inside the current group. As this is a sequence
// we want to allow trivia between the current token we're on, and the first token
// of the embedded sequence.
ConsumeCurrentToken(allowTrivia: true);
// When parsing out the sequence don't grab the close paren, that will be for our caller
// to get.
var expression = this.ParseAlternatingSequences(consumeCloseParen: false, isConditional: false);
_options = currentOptions;
return expression;
}
private readonly TextSpan GetTokenSpanIncludingEOF(RegexToken token)
=> token.Kind == RegexKind.EndOfFile
? GetTokenStartPositionSpan(token)
: token.GetSpan();
private readonly TextSpan GetTokenStartPositionSpan(RegexToken token)
{
return token.Kind == RegexKind.EndOfFile
? new TextSpan(_lexer.Text.Last().Span.End, 0)
: new TextSpan(token.VirtualChars[0].Span.Start, 0);
}
private RegexGroupingNode ParseGroupQuestion(RegexToken openParenToken, RegexToken questionToken)
{
var optionsToken = _lexer.TryScanOptions();
if (optionsToken != null)
{
return ParseOptionsGroupingNode(openParenToken, questionToken, optionsToken.Value);
}
var afterQuestionPos = _lexer.Position;
// Lots of possible options when we see (?. Look at the immediately following character
// (without any allowed spaces) to decide what to parse out next.
ConsumeCurrentToken(allowTrivia: false);
switch (_currentToken.Kind)
{
case RegexKind.LessThanToken:
// (?<=...) or (?<!...) or (?<...>...) or (?<...-...>...)
return ParseLookbehindOrNamedCaptureOrBalancingGrouping(openParenToken, questionToken);
case RegexKind.SingleQuoteToken:
// (?'...'...) or (?'...-...'...)
return ParseNamedCaptureOrBalancingGrouping(
openParenToken, questionToken, _currentToken);
case RegexKind.OpenParenToken:
// alternation construct (?(...) | )
return ParseConditionalGrouping(openParenToken, questionToken);
case RegexKind.ColonToken:
return ParseNonCapturingGroupingNode(openParenToken, questionToken);
case RegexKind.EqualsToken:
return ParsePositiveLookaheadGrouping(openParenToken, questionToken);
case RegexKind.ExclamationToken:
return ParseNegativeLookaheadGrouping(openParenToken, questionToken);
case RegexKind.GreaterThanToken:
return ParseAtomicGrouping(openParenToken, questionToken);
default:
if (_currentToken.Kind != RegexKind.CloseParenToken)
{
// Native parser reports "Unrecognized grouping construct", *except* for (?)
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Unrecognized_grouping_construct,
openParenToken.GetSpan()));
}
break;
}
// (?)
// Parse this as a normal group. The question will immediately error as it's a
// quantifier not following anything.
_lexer.Position = afterQuestionPos - 1;
return ParseSimpleGroup(openParenToken);
}
private RegexConditionalGroupingNode ParseConditionalGrouping(RegexToken openParenToken, RegexToken questionToken)
{
var innerOpenParenToken = _currentToken;
var afterInnerOpenParen = _lexer.Position;
var captureToken = _lexer.TryScanNumberOrCaptureName();
if (captureToken == null)
{
return ParseConditionalExpressionGrouping(openParenToken, questionToken);
}
var capture = captureToken.Value;
RegexToken innerCloseParenToken;
if (capture.Kind == RegexKind.NumberToken)
{
// If it's a numeric group, it has to be immediately followed by a ) and the
// numeric reference has to exist.
//
// That means that (?(4 ) is not treated as an embedded expression but as an
// error. This is different from (?(a ) which will be treated as an embedded
// expression, and different from (?(a) will be treated as an embedded
// expression or capture group depending on if 'a' is a existing capture name.
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.CloseParenToken)
{
innerCloseParenToken = _currentToken;
if (!HasCapture((int)capture.Value))
{
capture = capture.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Reference_to_undefined_group,
capture.GetSpan()));
}
}
else
{
innerCloseParenToken = CreateMissingToken(RegexKind.CloseParenToken);
capture = capture.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Malformed,
capture.GetSpan()));
MoveBackBeforePreviousScan();
}
}
else
{
// If it's a capture name, it's ok if that capture doesn't exist. In that case we
// will just treat this as an conditional expression.
if (!HasCapture((string)capture.Value))
{
_lexer.Position = afterInnerOpenParen;
return ParseConditionalExpressionGrouping(openParenToken, questionToken);
}
// Capture name existed. For this to be a capture grouping it exactly has to
// match (?(a) anything other than a close paren after the ) will make this
// into a conditional expression.
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind != RegexKind.CloseParenToken)
{
_lexer.Position = afterInnerOpenParen;
return ParseConditionalExpressionGrouping(openParenToken, questionToken);
}
innerCloseParenToken = _currentToken;
}
// Was (?(name) or (?(num) and name/num was a legal capture name. Parse
// this out as a conditional grouping. Because we're going to be parsing out
// an embedded sequence, allow trivia before the first element.
ConsumeCurrentToken(allowTrivia: true);
var result = ParseConditionalGroupingResult();
return new RegexConditionalCaptureGroupingNode(
openParenToken, questionToken,
innerOpenParenToken, capture, innerCloseParenToken,
result, ParseGroupingCloseParen());
}
private readonly bool HasCapture(int value)
=> _captureNumbersToSpan.ContainsKey(value);
private readonly bool HasCapture(string value)
=> _captureNamesToSpan.ContainsKey(value);
private void MoveBackBeforePreviousScan()
{
if (_currentToken.Kind != RegexKind.EndOfFile)
{
// Move back to un-consume whatever we just consumed.
_lexer.Position--;
}
}
private RegexConditionalGroupingNode ParseConditionalExpressionGrouping(
RegexToken openParenToken, RegexToken questionToken)
{
// Reproduce very specific errors the .NET regex parser looks for. Technically,
// we would error out in these cases no matter what. However, it means we can
// stringently enforce that our parser produces the same errors as the native one.
//
// Move back before the (
_lexer.Position--;
if (_lexer.IsAt("(?#"))
{
var pos = _lexer.Position;
var comment = _lexer.ScanComment(options: default);
Contract.ThrowIfFalse(comment.HasValue);
_lexer.Position = pos;
if (comment.Value.Diagnostics.Length > 0)
{
openParenToken = openParenToken.AddDiagnosticIfNone(comment.Value.Diagnostics[0]);
}
else
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Alternation_conditions_cannot_be_comments,
openParenToken.GetSpan()));
}
}
else if (_lexer.IsAt("(?'"))
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Alternation_conditions_do_not_capture_and_cannot_be_named,
openParenToken.GetSpan()));
}
else if (_lexer.IsAt("(?<"))
{
if (!_lexer.IsAt("(?<!") &&
!_lexer.IsAt("(?<="))
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Alternation_conditions_do_not_capture_and_cannot_be_named,
openParenToken.GetSpan()));
}
}
// Consume the ( once more.
ConsumeCurrentToken(allowTrivia: false);
Debug.Assert(_currentToken.Kind == RegexKind.OpenParenToken);
// Parse out the grouping that starts with the second open paren in (?(
// this will get us to (?(...)
var grouping = ParseGrouping();
// Now parse out the embedded expression that follows that. this will get us to
// (?(...)...
var result = ParseConditionalGroupingResult();
// Finally, grab the close paren and produce (?(...)...)
return new RegexConditionalExpressionGroupingNode(
openParenToken, questionToken,
grouping, result, ParseGroupingCloseParen());
}
private RegexExpressionNode ParseConditionalGroupingResult()
{
var currentOptions = _options;
var result = this.ParseAlternatingSequences(consumeCloseParen: false, isConditional: true);
_options = currentOptions;
return result;
}
private RegexGroupingNode ParseLookbehindOrNamedCaptureOrBalancingGrouping(
RegexToken openParenToken, RegexToken questionToken)
{
var start = _lexer.Position;
// We have (?< Look for (?<= or (?<!
var lessThanToken = ConsumeCurrentToken(allowTrivia: false);
switch (_currentToken.Kind)
{
case RegexKind.EqualsToken:
return new RegexPositiveLookbehindGroupingNode(
openParenToken, questionToken, lessThanToken, _currentToken,
ParseGroupingEmbeddedExpression(_options | RegexOptions.RightToLeft), ParseGroupingCloseParen());
case RegexKind.ExclamationToken:
return new RegexNegativeLookbehindGroupingNode(
openParenToken, questionToken, lessThanToken, _currentToken,
ParseGroupingEmbeddedExpression(_options | RegexOptions.RightToLeft), ParseGroupingCloseParen());
default:
// Didn't have a lookbehind group. Parse out as (?<...> or (?<...-...>
_lexer.Position = start;
return ParseNamedCaptureOrBalancingGrouping(openParenToken, questionToken, lessThanToken);
}
}
private RegexGroupingNode ParseNamedCaptureOrBalancingGrouping(
RegexToken openParenToken, RegexToken questionToken, RegexToken openToken)
{
if (_lexer.Position == _lexer.Text.Length)
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Unrecognized_grouping_construct,
GetSpan(openParenToken, openToken)));
}
// (?<...>...) or (?<...-...>...)
// (?'...'...) or (?'...-...'...)
var captureToken = _lexer.TryScanNumberOrCaptureName();
if (captureToken == null)
{
// Can't have any trivia between the elements in this grouping header.
ConsumeCurrentToken(allowTrivia: false);
captureToken = CreateMissingToken(RegexKind.CaptureNameToken);
if (_currentToken.Kind == RegexKind.MinusToken)
{
return ParseBalancingGrouping(
openParenToken, questionToken, openToken, captureToken.Value);
}
else
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Invalid_group_name_Group_names_must_begin_with_a_word_character,
GetTokenSpanIncludingEOF(_currentToken)));
// If we weren't at the end of the text, go back to before whatever character
// we just consumed.
MoveBackBeforePreviousScan();
}
}
var capture = captureToken.Value;
if (capture.Kind == RegexKind.NumberToken && (int)capture.Value == 0)
{
capture = capture.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Capture_number_cannot_be_zero,
capture.GetSpan()));
}
// Can't have any trivia between the elements in this grouping header.
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.MinusToken)
{
// Have (?<...- parse out the balancing group form.
return ParseBalancingGrouping(
openParenToken, questionToken,
openToken, capture);
}
var closeToken = ParseCaptureGroupingCloseToken(ref openParenToken, openToken);
return new RegexCaptureGroupingNode(
openParenToken, questionToken,
openToken, capture, closeToken,
ParseGroupingEmbeddedExpression(_options), ParseGroupingCloseParen());
}
private RegexToken ParseCaptureGroupingCloseToken(ref RegexToken openParenToken, RegexToken openToken)
{
if ((openToken.Kind == RegexKind.LessThanToken && _currentToken.Kind == RegexKind.GreaterThanToken) ||
(openToken.Kind == RegexKind.SingleQuoteToken && _currentToken.Kind == RegexKind.SingleQuoteToken))
{
return _currentToken;
}
if (_currentToken.Kind == RegexKind.EndOfFile)
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Unrecognized_grouping_construct,
GetSpan(openParenToken, openToken)));
}
else
{
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Invalid_group_name_Group_names_must_begin_with_a_word_character,
_currentToken.GetSpan()));
// Rewind to where we were before seeing this bogus character.
_lexer.Position--;
}
return CreateMissingToken(
openToken.Kind == RegexKind.LessThanToken
? RegexKind.GreaterThanToken : RegexKind.SingleQuoteToken);
}
private RegexBalancingGroupingNode ParseBalancingGrouping(
RegexToken openParenToken, RegexToken questionToken,
RegexToken openToken, RegexToken firstCapture)
{
var minusToken = _currentToken;
var secondCapture = _lexer.TryScanNumberOrCaptureName();
if (secondCapture == null)
{
// Invalid group name: Group names must begin with a word character
ConsumeCurrentToken(allowTrivia: false);
openParenToken = openParenToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Invalid_group_name_Group_names_must_begin_with_a_word_character,
GetTokenSpanIncludingEOF(_currentToken)));
// If we weren't at the end of the text, go back to before whatever character
// we just consumed.
MoveBackBeforePreviousScan();
secondCapture = CreateMissingToken(RegexKind.CaptureNameToken);
}
var second = secondCapture.Value;
CheckCapture(ref second);
// Can't have any trivia between the elements in this grouping header.
ConsumeCurrentToken(allowTrivia: false);
var closeToken = ParseCaptureGroupingCloseToken(ref openParenToken, openToken);
return new RegexBalancingGroupingNode(
openParenToken, questionToken,
openToken, firstCapture, minusToken, second, closeToken,
ParseGroupingEmbeddedExpression(_options), ParseGroupingCloseParen());
}
private readonly void CheckCapture(ref RegexToken captureToken)
{
if (captureToken.IsMissing)
{
// Don't need to check for a synthesized error capture token.
return;
}
if (captureToken.Kind == RegexKind.NumberToken)
{
var val = (int)captureToken.Value;
if (!HasCapture(val))
{
captureToken = captureToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
string.Format(FeaturesResources.Reference_to_undefined_group_number_0, val),
captureToken.GetSpan()));
}
}
else
{
var val = (string)captureToken.Value;
if (!HasCapture(val))
{
captureToken = captureToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
string.Format(FeaturesResources.Reference_to_undefined_group_name_0, val),
captureToken.GetSpan()));
}
}
}
private RegexNonCapturingGroupingNode ParseNonCapturingGroupingNode(RegexToken openParenToken, RegexToken questionToken)
=> new(
openParenToken, questionToken, _currentToken,
ParseGroupingEmbeddedExpression(_options), ParseGroupingCloseParen());
private RegexPositiveLookaheadGroupingNode ParsePositiveLookaheadGrouping(RegexToken openParenToken, RegexToken questionToken)
=> new(
openParenToken, questionToken, _currentToken,
ParseGroupingEmbeddedExpression(_options & ~RegexOptions.RightToLeft), ParseGroupingCloseParen());
private RegexNegativeLookaheadGroupingNode ParseNegativeLookaheadGrouping(RegexToken openParenToken, RegexToken questionToken)
=> new(
openParenToken, questionToken, _currentToken,
ParseGroupingEmbeddedExpression(_options & ~RegexOptions.RightToLeft), ParseGroupingCloseParen());
private RegexAtomicGroupingNode ParseAtomicGrouping(RegexToken openParenToken, RegexToken questionToken)
=> new(
openParenToken, questionToken, _currentToken,
ParseGroupingEmbeddedExpression(_options), ParseGroupingCloseParen());
private RegexGroupingNode ParseOptionsGroupingNode(
RegexToken openParenToken, RegexToken questionToken, RegexToken optionsToken)
{
// Only (?opts:...) or (?opts) are allowed. After the opts must be a : or )
ConsumeCurrentToken(allowTrivia: false);
switch (_currentToken.Kind)
{
case RegexKind.CloseParenToken:
// Allow trivia after the options and the next element in the sequence.
_options = GetNewOptionsFromToken(_options, optionsToken);
return new RegexSimpleOptionsGroupingNode(
openParenToken, questionToken, optionsToken,
ConsumeCurrentToken(allowTrivia: true));
case RegexKind.ColonToken:
return ParseNestedOptionsGroupingNode(openParenToken, questionToken, optionsToken);
default:
return new RegexSimpleOptionsGroupingNode(
openParenToken, questionToken, optionsToken,
CreateMissingToken(RegexKind.CloseParenToken).AddDiagnosticIfNone(
new EmbeddedDiagnostic(FeaturesResources.Unrecognized_grouping_construct, openParenToken.GetSpan())));
}
}
private RegexNestedOptionsGroupingNode ParseNestedOptionsGroupingNode(
RegexToken openParenToken, RegexToken questionToken, RegexToken optionsToken)
=> new(
openParenToken, questionToken, optionsToken, _currentToken,
ParseGroupingEmbeddedExpression(GetNewOptionsFromToken(_options, optionsToken)), ParseGroupingCloseParen());
private static bool IsTextChar(RegexToken currentToken, char ch)
=> currentToken.Kind == RegexKind.TextToken && currentToken.VirtualChars.Length == 1 && currentToken.VirtualChars[0].Value == ch;
private static RegexOptions GetNewOptionsFromToken(RegexOptions currentOptions, RegexToken optionsToken)
{
var copy = currentOptions;
var on = true;
foreach (var ch in optionsToken.VirtualChars)
{
switch (ch.Value)
{
case '-': on = false; break;
case '+': on = true; break;
default:
var newOption = OptionFromCode(ch);
if (on)
{
copy |= newOption;
}
else
{
copy &= ~newOption;
}
break;
}
}
return copy;
}
private static RegexOptions OptionFromCode(VirtualChar ch)
{
switch (ch.Value)
{
case 'i': case 'I': return RegexOptions.IgnoreCase;
case 'm': case 'M': return RegexOptions.Multiline;
case 'n': case 'N': return RegexOptions.ExplicitCapture;
case 's': case 'S': return RegexOptions.Singleline;
case 'x': case 'X': return RegexOptions.IgnorePatternWhitespace;
default:
throw new InvalidOperationException();
}
}
private RegexBaseCharacterClassNode ParseCharacterClass()
{
var openBracketToken = _currentToken;
Debug.Assert(openBracketToken.Kind == RegexKind.OpenBracketToken);
var caretToken = CreateMissingToken(RegexKind.CaretToken);
var closeBracketToken = CreateMissingToken(RegexKind.CloseBracketToken);
// trivia is not allowed anywhere in a character class
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.CaretToken)
{
caretToken = _currentToken;
}
else
{
MoveBackBeforePreviousScan();
}
// trivia is not allowed anywhere in a character class
ConsumeCurrentToken(allowTrivia: false);
using var _1 = ArrayBuilder<RegexExpressionNode>.GetInstance(out var builder);
while (_currentToken.Kind != RegexKind.EndOfFile)
{
Debug.Assert(_currentToken.VirtualChars.Length == 1);
if (_currentToken.Kind == RegexKind.CloseBracketToken && builder.Count > 0)
{
// Allow trivia after the character class, and whatever is next in the sequence.
closeBracketToken = ConsumeCurrentToken(allowTrivia: true);
break;
}
ParseCharacterClassComponents(builder);
}
// We will commonly get tons of text nodes in a row. For example, the regex `[abc]` will be three text
// nodes in a row. To help save on memory try to merge that into one single text node.
using var _2 = ArrayBuilder<RegexExpressionNode>.GetInstance(out var contents);
MergeTextNodes(builder, contents);
if (closeBracketToken.IsMissing)
{
closeBracketToken = closeBracketToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Unterminated_character_class_set,
GetTokenStartPositionSpan(_currentToken)));
}
var components = new RegexSequenceNode(contents.ToImmutable());
return caretToken.IsMissing
? new RegexCharacterClassNode(openBracketToken, components, closeBracketToken)
: new RegexNegatedCharacterClassNode(openBracketToken, caretToken, components, closeBracketToken);
}
private void ParseCharacterClassComponents(ArrayBuilder<RegexExpressionNode> components)
{
var left = ParseSingleCharacterClassComponent(isFirst: components.Count == 0, afterRangeMinus: false);
if (left.Kind is RegexKind.CharacterClassEscape or RegexKind.CategoryEscape ||
IsEscapedMinus(left))
{
// \s or \p{Lu} or \- on the left of a minus doesn't start a range. If there is a following
// minus, it's just treated textually.
components.Add(left);
return;
}
if (_currentToken.Kind == RegexKind.MinusToken && !_lexer.IsAt("]"))
{
// trivia is not allowed anywhere in a character class
var minusToken = ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.OpenBracketToken)
{
components.Add(left);
components.Add(ParseCharacterClassSubtractionNode(minusToken));
}
else
{
// Note that behavior of parsing here changed in .net. See issue:
// https://github.com/dotnet/corefx/issues/31786
//
// We follow the latest behavior in .net which parses things correctly.
var right = ParseSingleCharacterClassComponent(isFirst: false, afterRangeMinus: true);
if (TryGetRangeComponentValue(left, out var leftCh) &&
TryGetRangeComponentValue(right, out var rightCh) &&
leftCh > rightCh)
{
minusToken = minusToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.x_y_range_in_reverse_order,
minusToken.GetSpan()));
}
components.Add(new RegexCharacterClassRangeNode(left, minusToken, right));
}
}
else
{
components.Add(left);
}
}
private static bool IsEscapedMinus([NotNullWhen(true)] RegexNode? node)
=> node is RegexSimpleEscapeNode simple && IsTextChar(simple.TypeToken, '-');
private static bool TryGetRangeComponentValue(RegexExpressionNode component, out int ch)
{
// Don't bother examining the component if it has any errors already. This also means
// we don't have to worry about running into invalid escape sequences and the like.
if (!HasProblem(component))
{
return TryGetRangeComponentValueWorker(component, out ch);
}
ch = default;
return false;
}
private static bool TryGetRangeComponentValueWorker(RegexNode component, out int ch)
{
switch (component.Kind)
{
case RegexKind.SimpleEscape:
var escapeNode = (RegexSimpleEscapeNode)component;
ch = MapEscapeChar(escapeNode.TypeToken.VirtualChars[0]).Value;
return true;
case RegexKind.ControlEscape:
var controlEscape = (RegexControlEscapeNode)component;
var controlCh = controlEscape.ControlToken.VirtualChars[0].Value;
// \ca interpreted as \cA
if (controlCh is >= 'a' and <= 'z')
{
controlCh -= (char)('a' - 'A');
}
// The control characters have values mapping from the A-Z range to numeric
// values 1-26. So, to map that, we subtract 'A' from the value (which would
// give us 0-25) and then add '1' back to it.
ch = controlCh - 'A' + 1;
return true;
case RegexKind.OctalEscape:
ch = GetCharValue(((RegexOctalEscapeNode)component).OctalText, withBase: 8);
return true;
case RegexKind.HexEscape:
ch = GetCharValue(((RegexHexEscapeNode)component).HexText, withBase: 16);
return true;
case RegexKind.UnicodeEscape:
ch = GetCharValue(((RegexUnicodeEscapeNode)component).HexText, withBase: 16);
return true;
case RegexKind.PosixProperty:
// When the native parser sees [:...:] it treats this as if it just saw '[' and skipped the
// rest.
ch = '[';
return true;
case RegexKind.Text:
ch = ((RegexTextNode)component).TextToken.VirtualChars[0].Value;
return true;
case RegexKind.Sequence:
var sequence = (RegexSequenceNode)component;
#if DEBUG
Debug.Assert(sequence.ChildCount > 0);
for (int i = 0, n = sequence.ChildCount - 1; i < n; i++)
Debug.Assert(IsEscapedMinus(sequence.ChildAt(i).Node));
#endif
var last = sequence.ChildAt(sequence.ChildCount - 1).Node;
Contract.ThrowIfNull(last);
if (IsEscapedMinus(last))
break;
return TryGetRangeComponentValueWorker(last, out ch);
}
ch = default;
return false;
}
private static int GetCharValue(RegexToken hexText, int withBase)
{
unchecked
{
var total = 0;
foreach (var vc in hexText.VirtualChars)
{
total *= withBase;
total += HexValue(vc);
}
return total;
}
}
private static int HexValue(VirtualChar ch)
{
Debug.Assert(RegexLexer.IsHexChar(ch));
unchecked
{
var temp = ch.Value - '0';
if (temp is >= 0 and <= 9)
return temp;
temp = ch.Value - 'a';
if (temp is >= 0 and <= 5)
return temp + 10;
temp = ch.Value - 'A';
if (temp is >= 0 and <= 5)
return temp + 10;
}
throw new InvalidOperationException();
}
private static bool HasProblem(RegexNodeOrToken component)
{
if (component.IsNode)
{
foreach (var child in component.Node)
{
if (HasProblem(child))
{
return true;
}
}
}
else
{
var token = component.Token;
if (token.IsMissing ||
token.Diagnostics.Length > 0)
{
return true;
}
foreach (var trivia in token.LeadingTrivia)
{
if (trivia.Diagnostics.Length > 0)
{
return true;
}
}
}
return false;
}
private RegexPrimaryExpressionNode ParseSingleCharacterClassComponent(bool isFirst, bool afterRangeMinus)
{
if (_currentToken.Kind == RegexKind.BackslashToken && _lexer.Position < _lexer.Text.Length)
{
var backslashToken = _currentToken;
// trivia is not allowed anywhere in a character class, and definitely not between
// a \ and the following character.
ConsumeCurrentToken(allowTrivia: false);
Debug.Assert(_currentToken.VirtualChars.Length == 1);
var nextChar = _currentToken.VirtualChars[0];
switch (nextChar.Value)
{
case 'D':
case 'd':
case 'S':
case 's':
case 'W':
case 'w':
case 'p':
case 'P':
if (afterRangeMinus)
{
backslashToken = backslashToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
string.Format(FeaturesResources.Cannot_include_class_0_in_character_range, nextChar),
GetSpan(backslashToken, _currentToken)));
}
// move back before the character we just scanned.
// trivia is not allowed anywhere in a character class.
// The above list are character class and category escapes. ParseEscape can
// handle both of those, so we just defer to it.
_lexer.Position--;
return ParseEscape(backslashToken, allowTriviaAfterEnd: false);
case '-':
// trivia is not allowed anywhere in a character class.
// We just let the basic consumption code pull out a token for us, we then
// convert that to text since we treat all characters after the - as text no
// matter what.
return new RegexSimpleEscapeNode(
backslashToken, ConsumeCurrentToken(allowTrivia: false).With(kind: RegexKind.TextToken));
default:
// trivia is not allowed anywhere in a character class.
// Note: it is very intentional that we're calling ParseCharEscape and not
// ParseEscape. Normal escapes are not interpreted the same way inside a
// character class. For example \b is not an anchor in a character class.
// And things like \k'...' are not k-captures, etc. etc.
_lexer.Position--;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd: false);
}
}
if (!afterRangeMinus &&
!isFirst &&
_currentToken.Kind == RegexKind.MinusToken &&
_lexer.IsAt("["))
{
// have a trailing subtraction.
// trivia is not allowed anywhere in a character class
return ParseCharacterClassSubtractionNode(
ConsumeCurrentToken(allowTrivia: false));
}
// From the .NET regex code:
// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
// It currently doesn't do anything other than skip the whole thing!
if (!afterRangeMinus && _currentToken.Kind == RegexKind.OpenBracketToken && _lexer.IsAt(":"))
{
var beforeBracketPos = _lexer.Position - 1;
// trivia is not allowed anywhere in a character class
ConsumeCurrentToken(allowTrivia: false);
var captureName = _lexer.TryScanCaptureName();
if (captureName.HasValue && _lexer.IsAt(":]"))
{
_lexer.Position += 2;
var textChars = _lexer.GetSubPattern(beforeBracketPos, _lexer.Position);
var token = CreateToken(RegexKind.TextToken, [], textChars);
// trivia is not allowed anywhere in a character class
ConsumeCurrentToken(allowTrivia: false);
return new RegexPosixPropertyNode(token);
}
else
{
// Reset to back where we were.
// trivia is not allowed anywhere in a character class
_lexer.Position = beforeBracketPos;
ConsumeCurrentToken(allowTrivia: false);
Debug.Assert(_currentToken.Kind == RegexKind.OpenBracketToken);
}
}
// trivia is not allowed anywhere in a character class
return new RegexTextNode(
ConsumeCurrentToken(allowTrivia: false).With(kind: RegexKind.TextToken));
}
private RegexPrimaryExpressionNode ParseCharacterClassSubtractionNode(RegexToken minusToken)
{
var charClass = ParseCharacterClass();
if (_currentToken.Kind is not RegexKind.CloseBracketToken and not RegexKind.EndOfFile)
{
minusToken = minusToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.A_subtraction_must_be_the_last_element_in_a_character_class,
GetTokenStartPositionSpan(minusToken)));
}
return new RegexCharacterClassSubtractionNode(minusToken, charClass);
}
/// <summary>
/// Parses out an escape sequence. Escape sequences are allowed in top level sequences
/// and in character classes. In a top level sequence trivia will be allowed afterwards,
/// but in a character class trivia is not allowed afterwards.
/// </summary>
private RegexEscapeNode ParseEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
// No spaces between \ and next char.
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.EndOfFile)
{
backslashToken = backslashToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Illegal_backslash_at_end_of_pattern,
backslashToken.GetSpan()));
return new RegexSimpleEscapeNode(backslashToken, CreateMissingToken(RegexKind.TextToken));
}
Debug.Assert(_currentToken.VirtualChars.Length == 1);
switch (_currentToken.VirtualChars[0].Value)
{
case 'b':
case 'B':
case 'A':
case 'G':
case 'Z':
case 'z':
return new RegexAnchorEscapeNode(
backslashToken, ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd));
case 'w':
case 'W':
case 's':
case 'S':
case 'd':
case 'D':
return new RegexCharacterClassEscapeNode(
backslashToken, ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd));
case 'p':
case 'P':
return ParseCategoryEscape(backslashToken, allowTriviaAfterEnd);
}
// Move back to after the backslash
_lexer.Position--;
return ParseBasicBackslash(backslashToken, allowTriviaAfterEnd);
}
private RegexEscapeNode ParseBasicBackslash(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
// No spaces between \ and next char.
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.EndOfFile)
{
backslashToken = backslashToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Illegal_backslash_at_end_of_pattern,
backslashToken.GetSpan()));
return new RegexSimpleEscapeNode(backslashToken, CreateMissingToken(RegexKind.TextToken));
}
Debug.Assert(_currentToken.VirtualChars.Length == 1);
var ch = _currentToken.VirtualChars[0];
if (ch == 'k')
{
return ParsePossibleKCaptureEscape(backslashToken, allowTriviaAfterEnd);
}
if (ch.Value is '<' or '\'')
{
_lexer.Position--;
return ParsePossibleCaptureEscape(backslashToken, allowTriviaAfterEnd);
}
if (ch.Value is >= '1' and <= '9')
{
_lexer.Position--;
return ParsePossibleBackreferenceEscape(backslashToken, allowTriviaAfterEnd);
}
_lexer.Position--;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd);
}
private RegexEscapeNode ParsePossibleBackreferenceEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
return HasOption(_options, RegexOptions.ECMAScript)
? ParsePossibleEcmascriptBackreferenceEscape(backslashToken, allowTriviaAfterEnd)
: ParsePossibleRegularBackreferenceEscape(backslashToken, allowTriviaAfterEnd);
}
private RegexEscapeNode ParsePossibleEcmascriptBackreferenceEscape(
RegexToken backslashToken, bool allowTriviaAfterEnd)
{
// Small deviation: Ecmascript allows references only to captures that precede
// this position (unlike .NET which allows references in any direction). However,
// because we don't track position, we just consume the entire back-reference.
//
// This is addressable if we add position tracking when we locate all the captures.
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
var start = _lexer.Position;
var bestPosition = -1;
var capVal = 0;
while (_lexer.Position < _lexer.Text.Length &&
_lexer.Text[_lexer.Position] is var ch &&
(ch >= '0' && ch <= '9'))
{
unchecked
{
capVal *= 10;
capVal += (ch.Value - '0');
}
_lexer.Position++;
if (HasCapture(capVal))
{
bestPosition = _lexer.Position;
}
}
if (bestPosition != -1)
{
var numberToken = CreateToken(
RegexKind.NumberToken, [],
_lexer.GetSubPattern(start, bestPosition)).With(value: capVal);
ResetToPositionAndConsumeCurrentToken(bestPosition, allowTrivia: allowTriviaAfterEnd);
return new RegexBackreferenceEscapeNode(backslashToken, numberToken);
}
_lexer.Position = start;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd);
}
private RegexEscapeNode ParsePossibleRegularBackreferenceEscape(
RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
var start = _lexer.Position;
var number = _lexer.TryScanNumber();
Contract.ThrowIfNull(number);
var numberToken = number.Value;
var capVal = (int)numberToken.Value;
if (HasCapture(capVal) ||
capVal <= 9)
{
CheckCapture(ref numberToken);
ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
return new RegexBackreferenceEscapeNode(backslashToken, numberToken);
}
_lexer.Position = start;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd);
}
private RegexEscapeNode ParsePossibleCaptureEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
Debug.Assert(_lexer.Text[_lexer.Position].Value is '<' or '\'');
var afterBackslashPosition = _lexer.Position;
ScanCaptureParts(allowTriviaAfterEnd, out var openToken, out var capture, out var closeToken);
if (openToken.IsMissing || capture.IsMissing || closeToken.IsMissing)
{
_lexer.Position = afterBackslashPosition;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd);
}
return new RegexCaptureEscapeNode(
backslashToken, openToken, capture, closeToken);
}
private RegexEscapeNode ParsePossibleKCaptureEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
var typeToken = _currentToken;
var afterBackslashPosition = _lexer.Position - @"k".Length;
ScanCaptureParts(allowTriviaAfterEnd, out var openToken, out var capture, out var closeToken);
if (openToken.IsMissing)
{
backslashToken = backslashToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Malformed_named_back_reference,
GetSpan(backslashToken, typeToken)));
return new RegexSimpleEscapeNode(backslashToken, typeToken.With(kind: RegexKind.TextToken));
}
if (capture.IsMissing || closeToken.IsMissing)
{
// Native parser falls back to normal escape scanning, if it doesn't see a capture,
// or close brace. For normal .NET regexes, this will then fail later (as \k is not
// a legal escape), but will succeed for ecmascript regexes.
_lexer.Position = afterBackslashPosition;
return ParseCharEscape(backslashToken, allowTriviaAfterEnd);
}
return new RegexKCaptureEscapeNode(
backslashToken, typeToken, openToken, capture, closeToken);
}
private void ScanCaptureParts(
bool allowTriviaAfterEnd, out RegexToken openToken, out RegexToken capture, out RegexToken closeToken)
{
openToken = CreateMissingToken(RegexKind.LessThanToken);
capture = CreateMissingToken(RegexKind.CaptureNameToken);
closeToken = CreateMissingToken(RegexKind.GreaterThanToken);
// No trivia allowed in <cap> or 'cap'
ConsumeCurrentToken(allowTrivia: false);
if (_lexer.Position < _lexer.Text.Length &&
(_currentToken.Kind == RegexKind.LessThanToken || _currentToken.Kind == RegexKind.SingleQuoteToken))
{
openToken = _currentToken;
}
else
{
return;
}
var captureToken = _lexer.TryScanNumberOrCaptureName();
capture = captureToken == null
? CreateMissingToken(RegexKind.CaptureNameToken)
: captureToken.Value;
// No trivia allowed in <cap> or 'cap'
ConsumeCurrentToken(allowTrivia: false);
closeToken = CreateMissingToken(RegexKind.GreaterThanToken);
if (!capture.IsMissing &&
((openToken.Kind == RegexKind.LessThanToken && _currentToken.Kind == RegexKind.GreaterThanToken) ||
(openToken.Kind == RegexKind.SingleQuoteToken && _currentToken.Kind == RegexKind.SingleQuoteToken)))
{
CheckCapture(ref capture);
closeToken = ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
}
}
private RegexEscapeNode ParseCharEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] == '\\');
// no trivia between \ and the next char
ConsumeCurrentToken(allowTrivia: false);
Debug.Assert(_currentToken.VirtualChars.Length == 1);
var ch = _currentToken.VirtualChars[0];
if (ch.Value is >= '0' and <= '7')
{
_lexer.Position--;
var octalDigits = _lexer.ScanOctalCharacters(_options);
Debug.Assert(octalDigits.VirtualChars.Length > 0);
ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
return new RegexOctalEscapeNode(backslashToken, octalDigits);
}
switch (ch.Value)
{
case 'a':
case 'b':
case 'e':
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
return new RegexSimpleEscapeNode(
backslashToken, ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd));
case 'x':
return ParseHexEscape(backslashToken, allowTriviaAfterEnd);
case 'u':
return ParseUnicodeEscape(backslashToken, allowTriviaAfterEnd);
case 'c':
return ParseControlEscape(backslashToken, allowTriviaAfterEnd);
default:
var typeToken = ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd).With(kind: RegexKind.TextToken);
if (!HasOption(_options, RegexOptions.ECMAScript) && RegexCharClass.IsBoundaryWordChar(ch))
{
typeToken = typeToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
string.Format(FeaturesResources.Unrecognized_escape_sequence_0, ch),
typeToken.GetSpan()));
}
return new RegexSimpleEscapeNode(backslashToken, typeToken);
}
}
private RegexEscapeNode ParseUnicodeEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
var typeToken = _currentToken;
var hexChars = _lexer.ScanHexCharacters(4);
ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
return new RegexUnicodeEscapeNode(backslashToken, typeToken, hexChars);
}
private RegexEscapeNode ParseHexEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
var typeToken = _currentToken;
var hexChars = _lexer.ScanHexCharacters(2);
ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
return new RegexHexEscapeNode(backslashToken, typeToken, hexChars);
}
private RegexControlEscapeNode ParseControlEscape(RegexToken backslashToken, bool allowTriviaAfterEnd)
{
// Nothing allowed between \c and the next char
var typeToken = ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind == RegexKind.EndOfFile)
{
typeToken = typeToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Missing_control_character,
typeToken.GetSpan()));
return new RegexControlEscapeNode(backslashToken, typeToken, CreateMissingToken(RegexKind.TextToken));
}
Debug.Assert(_currentToken.VirtualChars.Length == 1);
var ch = _currentToken.VirtualChars[0].Value;
unchecked
{
// From: https://github.com/dotnet/corefx/blob/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1450
// Note: Roslyn accepts a control escape that current .NET parser does not.
// Specifically: \c[
//
// It is a bug that the .NET parser does not support this construct. The bug was
// reported at: https://github.com/dotnet/corefx/issues/26501 and was fixed for
// CoreFx with https://github.com/dotnet/corefx/commit/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7
//
// Because it was a bug, we follow the correct behavior. That means we will not
// report a diagnostic for a Regex that someone might run on a previous version of
// .NET that ends up throwing at runtime. That's acceptable. Our goal is to match
// the latest .NET 'correct' behavior. Not intermediary points with bugs that have
// since been fixed.
// \ca interpreted as \cA
if (ch is >= 'a' and <= 'z')
{
ch -= (char)('a' - 'A');
}
if (ch is >= '@' and <= '_')
{
var controlToken = ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd).With(kind: RegexKind.TextToken);
return new RegexControlEscapeNode(backslashToken, typeToken, controlToken);
}
else
{
typeToken = typeToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Unrecognized_control_character,
_currentToken.GetSpan()));
// Don't consume the bogus control character.
return new RegexControlEscapeNode(backslashToken, typeToken, CreateMissingToken(RegexKind.TextToken));
}
}
}
private RegexEscapeNode ParseCategoryEscape(RegexToken backslash, bool allowTriviaAfterEnd)
{
Debug.Assert(_lexer.Text[_lexer.Position - 1] is var ch && (ch == 'P' || ch == 'p'));
var typeToken = _currentToken;
var start = _lexer.Position;
if (!TryGetCategoryEscapeParts(
allowTriviaAfterEnd,
out var openBraceToken,
out var categoryToken,
out var closeBraceToken,
out var message))
{
ResetToPositionAndConsumeCurrentToken(start, allowTrivia: allowTriviaAfterEnd);
typeToken = typeToken.With(kind: RegexKind.TextToken).AddDiagnosticIfNone(new EmbeddedDiagnostic(
message, GetSpan(backslash, typeToken)));
return new RegexSimpleEscapeNode(backslash, typeToken);
}
return new RegexCategoryEscapeNode(backslash, typeToken, openBraceToken, categoryToken, closeBraceToken);
}
private bool TryGetCategoryEscapeParts(
bool allowTriviaAfterEnd,
out RegexToken openBraceToken,
out RegexToken categoryToken,
out RegexToken closeBraceToken,
[NotNullWhen(false)] out string? message)
{
openBraceToken = default;
categoryToken = default;
closeBraceToken = default;
message = null;
if (_lexer.Text.Length - _lexer.Position < "{x}".Length)
{
message = FeaturesResources.Incomplete_character_escape;
return false;
}
// no whitespace in \p{x}
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind != RegexKind.OpenBraceToken)
{
message = FeaturesResources.Malformed_character_escape;
return false;
}
openBraceToken = _currentToken;
var category = _lexer.TryScanEscapeCategory();
// no whitespace in \p{x}
ConsumeCurrentToken(allowTrivia: false);
if (_currentToken.Kind != RegexKind.CloseBraceToken)
{
message = FeaturesResources.Incomplete_character_escape;
return false;
}
if (category == null)
{
message = FeaturesResources.Unknown_property;
return false;
}
categoryToken = category.Value;
closeBraceToken = ConsumeCurrentToken(allowTrivia: allowTriviaAfterEnd);
return true;
}
private RegexTextNode ParseUnexpectedQuantifier(RegexExpressionNode? lastExpression)
{
// This is just a bogus element in the higher level sequence. Allow trivia
// after this to abide by the spirit of the native parser.
var token = ConsumeCurrentToken(allowTrivia: true);
CheckQuantifierExpression(lastExpression, ref token);
return new RegexTextNode(token.With(kind: RegexKind.TextToken));
}
private static void CheckQuantifierExpression(RegexExpressionNode? current, ref RegexToken token)
{
if (current == null ||
current.Kind == RegexKind.SimpleOptionsGrouping)
{
token = token.AddDiagnosticIfNone(new EmbeddedDiagnostic(
FeaturesResources.Quantifier_x_y_following_nothing, token.GetSpan()));
}
else if (current is RegexQuantifierNode or RegexLazyQuantifierNode)
{
token = token.AddDiagnosticIfNone(new EmbeddedDiagnostic(
string.Format(FeaturesResources.Nested_quantifier_0, token.VirtualChars.First()), token.GetSpan()));
}
}
}
|