// File: EmbeddedLanguages\RegularExpressions\CSharpRegexParserTests.cs
// Web Access
// Project: src\src\EditorFeatures\CSharpTest2\Microsoft.CodeAnalysis.CSharp.EditorFeatures2.UnitTests.csproj (Microsoft.CodeAnalysis.CSharp.EditorFeatures2.UnitTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
using System;
using System.Collections.Immutable;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Xml.Linq;
using Microsoft.CodeAnalysis.CSharp.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.EmbeddedLanguages.Common;
using Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.Text;
using Xunit;
 
namespace Microsoft.CodeAnalysis.CSharp.UnitTests.EmbeddedLanguages.RegularExpressions
{
    using RegexToken = EmbeddedSyntaxToken<RegexKind>;
    using RegexTrivia = EmbeddedSyntaxTrivia<RegexKind>;
 
    public partial class CSharpRegexParserTests
    {
        // Service used to convert a C# string-literal token into the virtual chars handed to the regex parser.
        private readonly IVirtualCharService _service = CSharpVirtualCharService.Instance;
        // Text prepended to a string literal so that the combination can be parsed as a statement.
        private const string _statmentPrefix = "var v = ";
 
        /// <summary>
        /// Wraps <paramref name="text"/> in a <c>var v = ...</c> statement, parses it, and returns the
        /// token found at index 3 of the statement's descendant tokens.
        /// </summary>
        /// <param name="text">Source text of a C# string literal, including its quotes.</param>
        /// <returns>The string-literal token; the test fails if the token is of any other kind.</returns>
        private static SyntaxToken GetStringToken(string text)
        {
            var statement = _statmentPrefix + text;
            var parsedStatement = SyntaxFactory.ParseStatement(statement);

            // The prefix contributes the tokens 'var', 'v' and '=', so the literal is expected at index 3.
            // ElementAt avoids materializing every token of the statement just to read one of them.
            var token = parsedStatement.DescendantTokens().ElementAt(3);

            // Assert.Equal reports the actual kind on failure; Assert.True on a comparison only reports "False".
            Assert.Equal(SyntaxKind.StringLiteralToken, token.Kind());

            return token;
        }
 
        /// <summary>
        /// Parses <paramref name="stringText"/>, optionally parses the reduced variants of it as well, and
        /// compares the XML rendering of the resulting tree against <paramref name="expected"/>.
        /// </summary>
        /// <param name="stringText">Source text of the C# string literal containing the regex.</param>
        /// <param name="expected">Expected XML rendering of the tree.</param>
        /// <param name="options">Regex options to parse with.</param>
        /// <param name="runSubTreeTests">Whether the reduced variants of the input are parsed too.</param>
        /// <param name="allowIndexOutOfRange">Forwarded to <c>TryParseTree</c>.</param>
        /// <param name="allowNullReference">Forwarded to <c>TryParseTree</c>.</param>
        /// <param name="allowOutOfMemory">Forwarded to <c>TryParseTree</c>.</param>
        /// <param name="allowDiagnosticsMismatch">Forwarded to <c>TryParseTree</c>.</param>
        private void Test(string stringText, string expected, RegexOptions options,
            bool runSubTreeTests = true,
            bool allowIndexOutOfRange = false,
            bool allowNullReference = false,
            bool allowOutOfMemory = false,
            bool allowDiagnosticsMismatch = false)
        {
            var (parsedTree, parsedText) = TryParseTree(stringText, options, conversionFailureOk: false,
                allowIndexOutOfRange, allowNullReference, allowOutOfMemory, allowDiagnosticsMismatch);

            // Callers may opt out of the sub-tree pass: some reduced inputs make the native
            // regex parser behave very badly (for example, never finishing compilation).
            if (runSubTreeTests)
            {
                TryParseSubTrees(stringText, options,
                    allowIndexOutOfRange, allowNullReference, allowOutOfMemory, allowDiagnosticsMismatch);
            }

            // Both sides are normalized to doubled quotes before comparing.
            const string EscapedQuote = "\"\"";

            var rendered = TreeToText(parsedText, parsedTree);
            var actual = rendered.Replace("\"", EscapedQuote);
            actual = actual.Replace("&quot;", EscapedQuote);

            var normalizedExpected = expected.Replace("\"", EscapedQuote);
            Assert.Equal(normalizedExpected, actual);
        }
 
        /// <summary>
        /// Parses a series of reduced variants of <paramref name="stringText"/> (trimmed from the right,
        /// trimmed from the left, and with each single character removed) so that the checks performed by
        /// <c>TryParseTree</c> run on each of them. Conversion failures are tolerated for these variants.
        /// </summary>
        private void TryParseSubTrees(
            string stringText, RegexOptions options,
            bool allowIndexOutOfRange,
            bool allowNullReference,
            bool allowOutOfMemory,
            bool allowDiagnosticsMismatch)
        {
            // Parses one reduced variant with the same settings as every other variant.
            void Check(string candidate)
                => TryParseTree(candidate, options, conversionFailureOk: true,
                    allowIndexOutOfRange, allowNullReference, allowOutOfMemory, allowDiagnosticsMismatch);

            // True once only the empty verbatim or regular literal is left.
            static bool IsEmptyLiteral(string literal)
                => literal is "@\"\"" or "\"\"";

            // Repeatedly drop the last character before the closing quote.
            var fromRight = stringText;
            while (!IsEmptyLiteral(fromRight))
            {
                fromRight = fromRight[..^2] + "\"";
                Check(fromRight);
            }

            // Repeatedly drop the first character after the opening quote (keeping a leading '@').
            var fromLeft = stringText;
            while (!IsEmptyLiteral(fromLeft))
            {
                fromLeft = fromLeft[0] == '@'
                    ? "@\"" + fromLeft[3..]
                    : "\"" + fromLeft[2..];

                Check(fromLeft);
            }

            // Remove each single character between the quotes in turn.
            var firstContentIndex = stringText[0] == '@' ? 2 : 1;
            for (var index = firstContentIndex; index < stringText.Length - 1; index++)
            {
                Check(stringText.Remove(index, 1));
            }
        }
 
        /// <summary>
        /// Converts <paramref name="stringText"/> to a string token and its virtual chars, then runs the
        /// regex parser on those chars without any further validation.
        /// </summary>
        /// <returns>
        /// The token, the parsed tree, and the virtual chars. The tree is <c>null</c> when the conversion
        /// to virtual chars failed, which fails the test unless <paramref name="conversionFailureOk"/> is set.
        /// </returns>
        private (SyntaxToken, RegexTree, VirtualCharSequence) JustParseTree(
            string stringText, RegexOptions options, bool conversionFailureOk)
        {
            var literalToken = GetStringToken(stringText);
            var virtualChars = _service.TryConvertToVirtualChars(literalToken);

            if (!virtualChars.IsDefault)
            {
                var parsed = RegexParser.TryParse(virtualChars, options);
                return (literalToken, parsed, virtualChars);
            }

            Assert.True(conversionFailureOk, "Failed to convert text to token.");
            return (literalToken, null, virtualChars);
        }
 
        /// <summary>
        /// Parses <paramref name="stringText"/>, checks the structural invariants of the resulting tree,
        /// and cross-checks the outcome against a <see cref="Regex"/> built from the same pattern and options.
        /// </summary>
        /// <param name="stringText">Source text of the C# string literal containing the regex.</param>
        /// <param name="options">Regex options used for both our parser and <see cref="Regex"/>.</param>
        /// <param name="conversionFailureOk">Whether failing to convert the literal to virtual chars is tolerated.</param>
        /// <param name="allowIndexOutOfRange">Tolerate an <see cref="IndexOutOfRangeException"/> from <see cref="Regex"/>.</param>
        /// <param name="allowNullReference">Tolerate a <see cref="NullReferenceException"/> from <see cref="Regex"/>.</param>
        /// <param name="allowOutOfMemory">Tolerate an <see cref="OutOfMemoryException"/> from <see cref="Regex"/>.</param>
        /// <param name="allowDiagnosticsMismatch">Skip the checks that our diagnostics agree with <see cref="Regex"/>.</param>
        /// <returns>
        /// The parsed tree together with the source text of the parsed literal, or <c>default</c> when no
        /// tree was produced (in which case the virtual chars are asserted to be in their default state).
        /// </returns>
        private (RegexTree, SourceText) TryParseTree(
            string stringText, RegexOptions options,
            bool conversionFailureOk,
            bool allowIndexOutOfRange,
            bool allowNullReference,
            bool allowOutOfMemory,
            bool allowDiagnosticsMismatch = false)
        {
            var (token, tree, allChars) = JustParseTree(stringText, options, conversionFailureOk);
            if (tree == null)
            {
                Assert.True(allChars.IsDefault);
                return default;
            }

            CheckInvariants(tree, allChars);
            var sourceText = token.SyntaxTree.GetText();
            var treeAndText = (tree, sourceText);

            Regex regex = null;
            try
            {
                regex = new Regex(token.ValueText, options);
            }
            catch (IndexOutOfRangeException) when (allowIndexOutOfRange)
            {
                // bug with .NET regex parser.  Can happen with patterns like: (?<-0
                Assert.NotEmpty(tree.Diagnostics);
                return treeAndText;
            }
            catch (NullReferenceException) when (allowNullReference)
            {
                // bug with .NET regex parser.  can happen with patterns like: (?(?S))
                return treeAndText;
            }
            catch (OutOfMemoryException) when (allowOutOfMemory)
            {
                // bug with .NET regex parser.  can happen with patterns like: a{2147483647,}
                return treeAndText;
            }
            catch (ArgumentException ex)
            {
                if (!allowDiagnosticsMismatch)
                {
                    // If Regex rejected the pattern, we must have reported at least one diagnostic as well.
                    Assert.NotEmpty(tree.Diagnostics);

                    // Ensure the diagnostic we emit is the same as the .NET one. Note: we can only
                    // do this in en-US as that's the only culture where we control the text exactly
                    // and can ensure it exactly matches Regex.  We depend on localization to do a
                    // good enough job here for other languages.
                    //
                    // The UI culture is the one consulted here because it, not the formatting culture,
                    // determines which language resource strings (and so these messages) are loaded in.
                    if (Thread.CurrentThread.CurrentUICulture.Name == "en-US")
                    {
                        Assert.True(tree.Diagnostics.Any(d => ex.Message.Contains(d.Message)));
                    }
                }

                return treeAndText;
            }

            // Regex accepted the pattern, so we should not have reported any diagnostics for it.
            if (!tree.Diagnostics.IsEmpty && !allowDiagnosticsMismatch)
            {
                var expectedDiagnostics = CreateDiagnosticsElement(sourceText, tree);
                Assert.False(true, "Expected diagnostics: \r\n" + expectedDiagnostics.ToString().Replace(@"""", @""""""));
            }

            // The numbered captures we computed must match the group numbers Regex reports.
            Assert.True(regex.GetGroupNumbers().OrderBy(v => v).SequenceEqual(
                tree.CaptureNumbersToSpan.Keys.OrderBy(v => v)));

            // Likewise for named captures; group names that parse as integers are excluded from the comparison.
            Assert.True(regex.GetGroupNames().Where(v => !int.TryParse(v, out _)).OrderBy(v => v).SequenceEqual(
                tree.CaptureNamesToSpan.Keys.OrderBy(v => v)));

            return treeAndText;
        }
 
        /// <summary>
        /// Renders <paramref name="tree"/> as XML text: the node structure, then the diagnostics (only
        /// when there are any), then the numbered captures followed by the named captures.
        /// </summary>
        private static string TreeToText(SourceText text, RegexTree tree)
        {
            var treeElement = new XElement("Tree");
            treeElement.Add(NodeToElement(tree.Root));

            if (tree.Diagnostics.Length > 0)
            {
                treeElement.Add(CreateDiagnosticsElement(text, tree));
            }

            // Each group of captures is emitted in key order.
            var numberedCaptures = tree.CaptureNumbersToSpan
                .OrderBy(pair => pair.Key)
                .Select(pair => new XElement(
                    "Capture",
                    new XAttribute("Name", pair.Key),
                    new XAttribute("Span", pair.Value),
                    GetTextAttribute(text, pair.Value)));

            var namedCaptures = tree.CaptureNamesToSpan
                .OrderBy(pair => pair.Key)
                .Select(pair => new XElement(
                    "Capture",
                    new XAttribute("Name", pair.Key),
                    new XAttribute("Span", pair.Value),
                    GetTextAttribute(text, pair.Value)));

            treeElement.Add(new XElement("Captures", numberedCaptures, namedCaptures));

            return treeElement.ToString();
        }
 
        /// <summary>
        /// Builds a <c>Diagnostics</c> element with one <c>Diagnostic</c> child per diagnostic in
        /// <paramref name="tree"/>, carrying its message, its span, and the source text under that span.
        /// </summary>
        private static XElement CreateDiagnosticsElement(SourceText text, RegexTree tree)
        {
            var diagnosticElements =
                from diagnostic in tree.Diagnostics
                select new XElement(
                    "Diagnostic",
                    new XAttribute("Message", diagnostic.Message),
                    new XAttribute("Span", diagnostic.Span),
                    GetTextAttribute(text, diagnostic.Span));

            return new XElement("Diagnostics", diagnosticElements);
        }
 
        /// <summary>
        /// Creates a <c>Text</c> attribute holding the portion of <paramref name="text"/> covered by
        /// <paramref name="span"/>.
        /// </summary>
        private static XAttribute GetTextAttribute(SourceText text, TextSpan span)
        {
            var spannedText = text.ToString(span);
            return new XAttribute("Text", spannedText);
        }
 
        /// <summary>
        /// Converts <paramref name="node"/> into an element named after its kind, with one child element
        /// per child node or token. Alternation nodes are delegated to <c>AlternationToElement</c>.
        /// </summary>
        private static XElement NodeToElement(RegexNode node)
        {
            if (node is not RegexAlternationNode alternation)
            {
                var result = new XElement(node.Kind.ToString());
                foreach (var child in node)
                {
                    if (child.IsNode)
                    {
                        result.Add(NodeToElement(child.Node));
                    }
                    else
                    {
                        result.Add(TokenToElement(child.Token));
                    }
                }

                return result;
            }

            return AlternationToElement(alternation, alternation.SequenceList.NodesAndTokens.Length);
        }
 
        /// <summary>
        /// Renders the first <paramref name="end"/> entries of the alternation's sequence list as nested
        /// elements: everything before the last separator, then that separator token, then the last sequence.
        /// </summary>
        private static XElement AlternationToElement(RegexAlternationNode alternationNode, int end)
        {
            // Alternations are rendered this way to keep tests in sync with how they used to be
            // structured. When more than one entry remains, the rendering nests: the recursive
            // rendering of the earlier entries, the separator token, then the final sequence.
            if (end != 1)
            {
                return new XElement(
                    alternationNode.Kind.ToString(),
                    AlternationToElement(alternationNode, end - 2),
                    TokenToElement(alternationNode.SequenceList.NodesAndTokens[end - 2].Token),
                    NodeToElement(alternationNode.SequenceList.NodesAndTokens[end - 1].Node));
            }

            // A single entry is printed directly, as that's what would normally be inlined into the parent.
            return NodeToElement(alternationNode.SequenceList.NodesAndTokens[0].Node);
        }
 
        /// <summary>
        /// Converts <paramref name="token"/> into an element named after its kind. The element carries a
        /// <c>value</c> attribute when the token has a value, a <c>Trivia</c> child when it has leading
        /// trivia, and the token's text when it has any virtual chars.
        /// </summary>
        private static XElement TokenToElement(RegexToken token)
        {
            var result = new XElement(token.Kind.ToString());

            var tokenValue = token.Value;
            if (tokenValue != null)
            {
                result.Add(new XAttribute("value", tokenValue));
            }

            var leadingTrivia = token.LeadingTrivia;
            if (leadingTrivia.Length > 0)
            {
                var triviaElements = leadingTrivia.Select(trivia => TriviaToElement(trivia));
                result.Add(new XElement("Trivia", triviaElements));
            }

            var tokenChars = token.VirtualChars;
            if (tokenChars.Length > 0)
            {
                result.Add(tokenChars.CreateString());
            }

            return result;
        }
 
        /// <summary>
        /// Converts <paramref name="trivia"/> into an element named after its kind whose content is the
        /// trivia's text.
        /// </summary>
        private static XElement TriviaToElement(RegexTrivia trivia)
        {
            var elementName = trivia.Kind.ToString();
            var triviaText = trivia.VirtualChars.CreateString();
            return new XElement(elementName, triviaText);
        }
 
        /// <summary>
        /// Walks the whole tree and asserts that the characters of its tokens and trivia, taken in
        /// order, match <paramref name="allChars"/> and account for every one of them.
        /// </summary>
        private static void CheckInvariants(RegexTree tree, VirtualCharSequence allChars)
        {
            var consumed = 0;
            CheckInvariants(tree.Root, ref consumed, allChars);

            // Every input character must have been accounted for by the walk.
            Assert.Equal(allChars.Length, consumed);
        }
 
        /// <summary>
        /// Checks every child of <paramref name="node"/> in order, recursing into child nodes and
        /// checking child tokens, advancing <paramref name="position"/> as characters are matched.
        /// </summary>
        private static void CheckInvariants(RegexNode node, ref int position, VirtualCharSequence allChars)
        {
            foreach (var nodeOrToken in node)
            {
                if (!nodeOrToken.IsNode)
                {
                    CheckInvariants(nodeOrToken.Token, ref position, allChars);
                    continue;
                }

                CheckInvariants(nodeOrToken.Node, ref position, allChars);
            }
        }
 
        /// <summary>
        /// Checks the leading trivia of <paramref name="token"/> and then the token's own characters
        /// against <paramref name="allChars"/>, starting at <paramref name="position"/>.
        /// </summary>
        private static void CheckInvariants(RegexToken token, ref int position, VirtualCharSequence allChars)
        {
            // Leading trivia precedes the token's own characters in the input.
            var leadingTrivia = token.LeadingTrivia;
            CheckInvariants(leadingTrivia, ref position, allChars);

            var tokenChars = token.VirtualChars;
            CheckCharacters(tokenChars, ref position, allChars);
        }
 
        /// <summary>
        /// Checks each trivia in <paramref name="leadingTrivia"/>, in order, against
        /// <paramref name="allChars"/>.
        /// </summary>
        private static void CheckInvariants(ImmutableArray<RegexTrivia> leadingTrivia, ref int position, VirtualCharSequence allChars)
        {
            for (var index = 0; index < leadingTrivia.Length; index++)
            {
                CheckInvariants(leadingTrivia[index], ref position, allChars);
            }
        }
 
        /// <summary>
        /// Asserts that <paramref name="trivia"/> is comment or whitespace trivia and that its characters
        /// match <paramref name="allChars"/> starting at <paramref name="position"/>.
        /// </summary>
        private static void CheckInvariants(RegexTrivia trivia, ref int position, VirtualCharSequence allChars)
        {
            var isKnownTriviaKind = trivia.Kind is RegexKind.CommentTrivia or RegexKind.WhitespaceTrivia;
            if (!isKnownTriviaKind)
            {
                Assert.False(true, "Incorrect trivia kind");
                return;
            }

            CheckCharacters(trivia.VirtualChars, ref position, allChars);
        }
 
        /// <summary>
        /// Asserts that <paramref name="virtualChars"/> equals the run of <paramref name="allChars"/>
        /// beginning at <paramref name="position"/>, then advances <paramref name="position"/> past it.
        /// </summary>
        private static void CheckCharacters(VirtualCharSequence virtualChars, ref int position, VirtualCharSequence allChars)
        {
            var start = position;
            var count = virtualChars.Length;

            var offset = 0;
            while (offset < count)
            {
                Assert.Equal(allChars[start + offset], virtualChars[offset]);
                offset++;
            }

            // Only advance once the whole run has been verified.
            position = start + count;
        }
 
        /// <summary>
        /// Builds a single pattern from <paramref name="regexes"/>: the last one is wrapped in a group,
        /// and each earlier one (from last to first) wraps the result so far in a
        /// <c>(?(regex)yes|[0-[0]])</c> conditional.
        /// </summary>
        private static string And(params string[] regexes)
        {
            var innermost = "(" + regexes[^1] + ")";

            // Fold the remaining regexes from right to left around the innermost group.
            return regexes
                .Take(regexes.Length - 1)
                .Reverse()
                .Aggregate(innermost, (inner, condition) => $"(?({condition}){inner}|[0-[0]])");
        }
 
        /// <summary>
        /// Wraps <paramref name="regex"/> in a conditional of the form <c>(?(regex)[0-[0]]|.*)</c>.
        /// </summary>
        private static string Not(string regex)
        {
            return "(?(" + regex + ")[0-[0]]|.*)";
        }
 
        // Parses a verbatim literal made of many lines of unclosed open parens and checks that the
        // literal still yields a real token and virtual chars, but that no tree is returned for it.
        [Fact]
        public void TestDeepRecursion()
        {
            var (token, tree, chars) =
                JustParseTree(
@"@""((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((""", RegexOptions.None, conversionFailureOk: false);
            // The literal itself is well formed, so a token and virtual chars must exist.
            Assert.False(token.IsMissing);
            Assert.False(chars.IsDefaultOrEmpty);
            // No tree is expected back for this input.
            Assert.Null(tree);
        }
 
        // Parses verbatim literals holding 1 through 1199 open parens and checks that each one yields
        // a real token and non-empty virtual chars. The parsed tree itself is not inspected.
        [Fact]
        public void TestNoStackOverflow()
        {
            foreach (var depth in Enumerable.Range(1, 1199))
            {
                var openParens = new string('(', depth);
                var literal = "@\"" + openParens + "\"";

                var (token, _, chars) = JustParseTree(literal, RegexOptions.None, conversionFailureOk: false);

                Assert.False(token.IsMissing);
                Assert.False(chars.IsDefaultOrEmpty);
            }
        }
    }
}