// File: EmbeddedLanguages\RegularExpressions\RegexParser.CaptureInfoAnalyzer.cs
// Web Access
// Project: src\src\Features\Core\Portable\Microsoft.CodeAnalysis.Features.csproj (Microsoft.CodeAnalysis.Features)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
using System.Collections.Immutable;
using System.Diagnostics;
using System.Text.RegularExpressions;
using Microsoft.CodeAnalysis.EmbeddedLanguages.Common;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
 
namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions;
 
using static EmbeddedSyntaxHelpers;
using static RegexHelpers;
 
using RegexToken = EmbeddedSyntaxToken<RegexKind>;
 
internal partial struct RegexParser
{
    /// <summary>
    /// Analyzes the first parsed tree to determine the set of capture numbers and names.  These are
    /// then used to do the second parsing pass as they can change how the regex engine interprets
    /// some parts of the pattern (though not the groups themselves).
    /// </summary>
    private struct CaptureInfoAnalyzer
    {
        /// <summary>
        /// The characters of the pattern being analyzed.  Used to compute the span of capture 0
        /// and to find the end of a grouping whose close paren is missing.
        /// </summary>
        private readonly VirtualCharSequence _text;

        /// <summary>
        /// Maps each capture number to the span recorded for it.  Only the first span seen for a
        /// given number is kept.
        /// </summary>
        private readonly ImmutableDictionary<int, TextSpan>.Builder _captureNumberToSpan;

        /// <summary>
        /// Maps each capture name to the span recorded for it.  Only the first span seen for a
        /// given name is kept.
        /// </summary>
        private readonly ImmutableDictionary<string, TextSpan>.Builder _captureNameToSpan;

        /// <summary>
        /// The distinct capture names, in the order they were first recorded.  This is the order
        /// in which <see cref="AssignNumbersToCaptureNames"/> hands out numbers.  Obtained from
        /// <see cref="ArrayBuilder{T}.GetInstance()"/> and released with <c>Free</c> when the
        /// analysis finishes.
        /// </summary>
        private readonly ArrayBuilder<string> _captureNames;

        /// <summary>
        /// The next number to try when a capture needs an automatically assigned number.
        /// Starts at 1 because 0 is always recorded by the constructor.
        /// </summary>
        private int _autoNumber;

        /// <summary>
        /// Current nesting depth of <see cref="CollectCaptures"/> calls; passed to the stack guard.
        /// </summary>
        private int _recursionDepth;

        /// <summary>
        /// Creates an analyzer for <paramref name="text"/> and records capture number 0, mapped to
        /// the span computed from the whole text (or the default span when the text is empty).
        /// </summary>
        private CaptureInfoAnalyzer(VirtualCharSequence text)
        {
            _text = text;
            _captureNumberToSpan = ImmutableDictionary.CreateBuilder<int, TextSpan>();
            _captureNameToSpan = ImmutableDictionary.CreateBuilder<string, TextSpan>();
            _captureNames = ArrayBuilder<string>.GetInstance();
            _autoNumber = 1;
            _recursionDepth = 0;

            _captureNumberToSpan.Add(0, text.IsEmpty ? default : GetSpan(text));
        }

        /// <summary>
        /// Walks <paramref name="root"/> and returns the capture-name-to-span and
        /// capture-number-to-span maps for the pattern.
        /// </summary>
        /// <param name="text">The characters of the pattern that <paramref name="root"/> was parsed from.</param>
        /// <param name="root">The tree produced by the first parsing pass.</param>
        /// <param name="options">The options in effect at the start of the pattern.</param>
        /// <returns>A tuple of (name to span, number to span).</returns>
        public static (ImmutableDictionary<string, TextSpan>, ImmutableDictionary<int, TextSpan>) Analyze(
            VirtualCharSequence text, RegexCompilationUnit root, RegexOptions options)
        {
            var analyzer = new CaptureInfoAnalyzer(text);
            return analyzer.Analyze(root, options);
        }

        /// <summary>
        /// Collects every capture in <paramref name="root"/>, then gives each named capture a
        /// number, and returns immutable snapshots of both maps.
        /// </summary>
        /// <param name="root">The tree produced by the first parsing pass.</param>
        /// <param name="options">The options in effect at the start of the pattern.</param>
        /// <returns>A tuple of (name to span, number to span).</returns>
        private (ImmutableDictionary<string, TextSpan>, ImmutableDictionary<int, TextSpan>) Analyze(
            RegexCompilationUnit root, RegexOptions options)
        {
            try
            {
                CollectCaptures(root, options);
                AssignNumbersToCaptureNames();

                return (_captureNameToSpan.ToImmutable(), _captureNumberToSpan.ToImmutable());
            }
            finally
            {
                // Always release the builder acquired in the constructor, including when the
                // tree walk above throws (for example from the stack guard check).
                _captureNames.Free();
            }
        }

        /// <summary>
        /// Recursion entry point: tracks the recursion depth, runs the stack guard check, and then
        /// processes <paramref name="node"/>.  The depth is restored even if processing throws.
        /// </summary>
        /// <param name="node">The node to process.</param>
        /// <param name="options">The options in effect at this node.</param>
        private void CollectCaptures(RegexNode node, RegexOptions options)
        {
            try
            {
                _recursionDepth++;
                StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
                CollectCapturesWorker(node, options);
            }
            finally
            {
                _recursionDepth--;
            }
        }

        /// <summary>
        /// Records a capture for <paramref name="node"/> if its kind introduces one, then descends
        /// into its children.  Conditional-expression and nested-options groupings handle their
        /// own descent and return early instead of falling through to the generic child walk.
        /// </summary>
        /// <param name="node">The node to process.</param>
        /// <param name="options">The options in effect at this node.</param>
        private void CollectCapturesWorker(RegexNode node, RegexOptions options)
        {
            switch (node.Kind)
            {
                case RegexKind.CaptureGrouping:
                    var captureGrouping = (RegexCaptureGroupingNode)node;
                    RecordCapture(captureGrouping.CaptureToken, GetGroupingSpan(captureGrouping));
                    break;

                case RegexKind.BalancingGrouping:
                    // Only the first capture token of the balancing group is recorded here.
                    var balancingGroup = (RegexBalancingGroupingNode)node;
                    RecordCapture(balancingGroup.FirstCaptureToken, GetGroupingSpan(balancingGroup));
                    break;

                case RegexKind.ConditionalExpressionGrouping:
                    // Explicitly recurse into conditionalGrouping.Grouping.  That grouping
                    // itself does not create a capture group, but nested groupings inside of it
                    // will.
                    var conditionalGrouping = (RegexConditionalExpressionGroupingNode)node;
                    RecurseIntoChildren(conditionalGrouping.Grouping, options);
                    CollectCaptures(conditionalGrouping.Result, options);
                    return;

                case RegexKind.SimpleGrouping:
                    RecordSimpleGroupingCapture((RegexSimpleGroupingNode)node, options);
                    break;

                case RegexKind.NestedOptionsGrouping:
                    // When we see (?opts:...)
                    // Recurse explicitly, setting the new options as we process the inner expression.
                    // When this pops out we'll be back to these options we're currently at now.
                    var nestedOptions = (RegexNestedOptionsGroupingNode)node;
                    CollectCaptures(nestedOptions.Expression, GetNewOptionsFromToken(options, nestedOptions.OptionsToken));
                    return;
            }

            RecurseIntoChildren(node, options);
        }

        /// <summary>
        /// Processes every child node of <paramref name="node"/> in order.  Options set by a
        /// simple options group <c>(?opts)</c> apply to that child and to the siblings that
        /// follow it within this call.
        /// </summary>
        /// <param name="node">The node whose children are visited.</param>
        /// <param name="options">The options in effect when the first child is reached.</param>
        private void RecurseIntoChildren(RegexNode node, RegexOptions options)
        {
            foreach (var child in node)
            {
                if (child.IsNode)
                {
                    // When we see a SimpleOptionsGroup ```(?opts)``` then determine what the options will
                    // be for successive nodes in the sequence.
                    var childNode = child.Node;
                    if (childNode is RegexSimpleOptionsGroupingNode simpleOptions)
                    {
                        options = GetNewOptionsFromToken(options, simpleOptions.OptionsToken);
                    }

                    CollectCaptures(childNode, options);
                }
            }
        }

        /// <summary>
        /// Computes the span of <paramref name="grouping"/>, from the first character of its open
        /// paren to the last character of its close paren.  If the close paren is missing, the
        /// span runs to the last character of the pattern text instead.
        /// </summary>
        /// <param name="grouping">The grouping to measure; its open paren must not be missing.</param>
        private readonly TextSpan GetGroupingSpan(RegexGroupingNode grouping)
        {
            Debug.Assert(!grouping.OpenParenToken.IsMissing);
            var lastChar = grouping.CloseParenToken.IsMissing
                ? _text.Last()
                : grouping.CloseParenToken.VirtualChars.Last();

            return GetSpan(grouping.OpenParenToken.VirtualChars[0], lastChar);
        }

        /// <summary>
        /// Records an automatically numbered capture for a plain <c>(...)</c> grouping, unless
        /// <see cref="RegexOptions.ExplicitCapture"/> is on or the grouping is a bogus
        /// <c>(?</c> form.
        /// </summary>
        /// <param name="node">The simple grouping to consider.</param>
        /// <param name="options">The options in effect at this grouping.</param>
        private void RecordSimpleGroupingCapture(RegexSimpleGroupingNode node, RegexOptions options)
        {
            if (HasOption(options, RegexOptions.ExplicitCapture))
            {
                // Don't automatically add simple groups if the explicit capture option is on.
                // Only add captures for 'CaptureGrouping' and 'BalancingGrouping' nodes.
                return;
            }

            // Don't count a bogus (? node as a capture node.  We only have this to keep our error
            // messages in line with the native parser.  i.e. even though the bogus (? code would
            // cause an exception, we might get an earlier exception if there's a reference to
            // this grouping.  So if we note this grouping we'll end up not causing that error
            // to happen, bringing our behavior out of sync with the native system.
            var expr = node.Expression;
            if (expr is RegexAlternationNode alternation)
                expr = alternation.SequenceList[0];

            if (expr is RegexSequenceNode sequence &&
                sequence.ChildCount > 0)
            {
                var leftMost = sequence.ChildAt(0);
                if (leftMost.Node is RegexTextNode textNode &&
                    IsTextChar(textNode.TextToken, '?'))
                {
                    return;
                }
            }

            // The automatic number is consumed even when it turns out to already be present in
            // the map (in which case the existing span is kept).
            AddIfMissing(_captureNumberToSpan, list: null, _autoNumber++, GetGroupingSpan(node));
        }

        /// <summary>
        /// Records the capture identified by <paramref name="token"/>.  A number token goes into
        /// the number map; any other token is treated as a name and goes into the name map and
        /// the ordered name list.  Missing tokens are ignored.
        /// </summary>
        /// <param name="token">The capture's number or name token.</param>
        /// <param name="span">The span of the grouping that declares the capture.</param>
        private readonly void RecordCapture(RegexToken token, TextSpan span)
        {
            if (!token.IsMissing)
            {
                if (token.Kind == RegexKind.NumberToken)
                {
                    AddIfMissing(_captureNumberToSpan, list: null, (int)token.Value, span);
                }
                else
                {
                    AddIfMissing(_captureNameToSpan, list: _captureNames, (string)token.Value, span);
                }
            }
        }

        /// <summary>
        /// Adds <paramref name="val"/> to <paramref name="mapping"/> (and to
        /// <paramref name="list"/>, when one is supplied) only if the key is not already present.
        /// An existing entry is left untouched, so the first span recorded for a key wins.
        /// </summary>
        /// <param name="mapping">The map to add to.</param>
        /// <param name="list">Optional list that receives newly added keys in order; may be null.</param>
        /// <param name="val">The capture number or name.</param>
        /// <param name="span">The span to associate with <paramref name="val"/>.</param>
        private static void AddIfMissing<T>(
            ImmutableDictionary<T, TextSpan>.Builder mapping,
            ArrayBuilder<T> list,
            T val, TextSpan span)
        {
            if (!mapping.ContainsKey(val))
            {
                mapping.Add(val, span);
                list?.Add(val);
            }
        }

        /// <summary>
        /// Give numbers to all named captures.  They will get successive <see
        /// cref="_autoNumber"/> values that have not already been handed out to existing
        /// numbered capture groups.
        /// </summary>
        private void AssignNumbersToCaptureNames()
        {
            foreach (var name in _captureNames)
            {
                // Skip over any number that is already taken.
                while (_captureNumberToSpan.ContainsKey(_autoNumber))
                {
                    _autoNumber++;
                }

                _captureNumberToSpan.Add(_autoNumber, _captureNameToSpan[name]);
                _autoNumber++;
            }
        }
    }
}