// File: RegexGenerator.cs
// Web Access
// Project: src\src\libraries\System.Text.RegularExpressions\gen\System.Text.RegularExpressions.Generator.csproj (System.Text.RegularExpressions.Generator)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.CodeDom.Compiler;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
 
[assembly: System.Resources.NeutralResourcesLanguage("en-us")]
 
namespace System.Text.RegularExpressions.Generator
{
    /// <summary>Generates C# source code to implement regular expressions.</summary>
    [Generator(LanguageNames.CSharp)]
    public partial class RegexGenerator : IIncrementalGenerator
    {
        /// <summary>Name of the type emitted to contain helpers used by the generated code.</summary>
        /// <remarks>
        /// The type is written as a <c>file static class</c> inside <see cref="GeneratedNamespace"/>, and only
        /// when at least one generated regex asked for a helper.
        /// </remarks>
        private const string HelpersTypeName = "Utilities";
        /// <summary>Namespace containing all the generated code.</summary>
        private const string GeneratedNamespace = "System.Text.RegularExpressions.Generated";
        /// <summary>Code for a [GeneratedCode] attribute to put on the top-level generated members.</summary>
        /// <remarks>
        /// The tool name and version arguments are taken from this generator assembly's name and version.
        /// The text does not include the surrounding square brackets; callers add them when emitting.
        /// </remarks>
        private static readonly string s_generatedCodeAttribute = $"GeneratedCodeAttribute(\"{typeof(RegexGenerator).Assembly.GetName().Name}\", \"{typeof(RegexGenerator).Assembly.GetName().Version}\")";
        /// <summary>Header lines to include at the top of every generated file.</summary>
        /// <remarks>
        /// Contains the auto-generated marker, the nullable directive, and warning suppressions. DEBUG builds of
        /// the generator suppress only a few specific warnings (unreachable code, unreferenced label, unused
        /// variable); all other builds emit a blanket <c>#pragma warning disable</c>.
        /// The <c>using</c> directives for the generated namespace are not part of this list; they are written
        /// separately once the namespace block has been opened.
        /// </remarks>
        private static readonly string[] s_headers =
        [
            "// <auto-generated/>",
            "#nullable enable",
#if DEBUG
            "#pragma warning disable CS0162 // Unreachable code",
            "#pragma warning disable CS0164 // Unreferenced label",
            "#pragma warning disable CS0219 // Variable assigned but never used",
#else
            "#pragma warning disable",
#endif
        ];
 
        /// <summary>Settings carried alongside each regex through the generator pipeline and consumed during analysis and emission.</summary>
        /// <param name="AllowUnsafe">Forwarded to the emitter of the Regex-derived implementation.</param>
        /// <param name="CheckOverflow">Forwarded to the emitter of the runner factory.</param>
        /// <param name="LanguageVersion">Used to decide whether full code generation is supported, and forwarded to the limited-support boilerplate emitter.</param>
        /// <remarks>
        /// NOTE(review): presumably populated from the consuming compilation's options where the attribute data
        /// is gathered; that code is not in this part of the file, so confirm there.
        /// </remarks>
        internal record struct CompilationData(bool AllowUnsafe, bool CheckOverflow, LanguageVersion LanguageVersion);
 
        /// <summary>Builds the incremental pipeline that turns every GeneratedRegex-attributed member into generated source and diagnostics.</summary>
        /// <param name="context">The initialization context used to access the syntax provider and to register the outputs.</param>
        /// <remarks>
        /// The pipeline gathers the data for each attributed member, parses and analyzes the regex, generates the
        /// runner factory for each supported regex, and collects everything into a single batch. The batch is then
        /// split into a source model (emitted as one <c>RegexGenerator.g.cs</c> file) and a list of diagnostics,
        /// each of which is registered as its own output.
        /// </remarks>
        public void Initialize(IncrementalGeneratorInitializationContext context)
        {
            // Produces one entry per generated regex.  While flowing through the pipeline, an entry may be:
            // - Diagnostic in the case of a failure that should end the compilation
            // - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers, CompilationData compilationData) in the case of valid regex
            // - (RegexMethod regexMethod, string reason, Diagnostic diagnostic, CompilationData compilationData) in the case of a limited-support regex
            //
            // After collection, the diagnostics are split out into their own array, so a limited-support entry in the
            // resulting source model is (RegexMethod regexMethod, string reason, CompilationData compilationData).
            //
            // Location is threaded separately from the records so that it doesn't participate in
            // record equality — this allows the incremental pipeline to cache results by value.
            IncrementalValueProvider<(ImmutableArray<object> Results, ImmutableArray<Diagnostic> Diagnostics)> collected =
                context.SyntaxProvider

                // Find all method, property, indexer, and accessor declaration nodes attributed with GeneratedRegex and gather
                // the required information.
                // The predicate will be run once for every attributed node in the same file that's being modified.
                // The transform will be run once for every attributed node in the compilation.
                // Thus, both should do the minimal amount of work required and get out.  This should also have extracted
                // everything from the target necessary to do all subsequent analysis and should return an object that's
                // meaningfully comparable and that doesn't reference anything from the compilation: we want to ensure
                // that any successful cached results are idempotent for the input such that they don't trigger downstream work
                // if there are no changes.
                .ForAttributeWithMetadataName(
                    GeneratedRegexAttributeName,
                    (node, _) => node is MethodDeclarationSyntax or PropertyDeclarationSyntax or IndexerDeclarationSyntax or AccessorDeclarationSyntax,
                    GetRegexMethodDataOrFailureDiagnostic)

                // Filter out any parsing errors that resulted in null objects being returned.
                .Where(static m => m is not null)

                // The input here will either be a Diagnostic (in the case of something erroneous detected in GetRegexMethodDataOrFailureDiagnostic)
                // or it will be a RegexPatternAndSyntax containing all of the successfully parsed data from the attribute/method.
                // This step parses the regex tree and checks whether full code generation is supported.
                // The DiagnosticLocation is consumed here for diagnostic creation and not propagated further.
                .Select((methodOrDiagnostic, _) =>
                {
                    if (methodOrDiagnostic is RegexPatternAndSyntax method)
                    {
                        try
                        {
                            RegexTree regexTree = RegexParser.Parse(method.Pattern, method.Options | RegexOptions.Compiled, method.Culture); // make sure Compiled is included to get all optimizations applied to it
                            AnalysisResults analysis = RegexTreeAnalyzer.Analyze(regexTree);
                            RegexMethod regexMethod = new(method.DeclaringType, method.IsProperty, method.MemberName, method.Modifiers, method.NullableRegex, method.Pattern, method.Options, method.MatchTimeout, regexTree, analysis, method.CompilationData);

                            // If we're unable to generate a full implementation for this regex, report a diagnostic.
                            // We'll still output a limited implementation that just caches a new Regex(...).
                            if (!SupportsCodeGeneration(regexMethod, regexMethod.CompilationData.LanguageVersion, out string? reason))
                            {
                                return (object)(regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, method.DiagnosticLocation), regexMethod.CompilationData);
                            }

                            return regexMethod;
                        }
                        catch (Exception e)
                        {
                            // Any failure while parsing/analyzing is surfaced to the user as a diagnostic rather than crashing the generator.
                            return Diagnostic.Create(DiagnosticDescriptors.InvalidRegexArguments, method.DiagnosticLocation, e.Message);
                        }
                    }

                    return methodOrDiagnostic;
                })

                // Generate the RunnerFactory for each regex, if possible.  This is where the bulk of the implementation occurs.
                .Select((state, _) =>
                {
                    if (state is not RegexMethod regexMethod)
                    {
                        Debug.Assert(state is Diagnostic or ValueTuple<RegexMethod, string, Diagnostic, CompilationData>);
                        return state;
                    }

                    // Generate the core logic for the regex.
                    // The writers are disposed when the lambda exits; the generated text is captured by sw.ToString()
                    // in the return expression, which is evaluated before disposal.
                    Dictionary<string, string[]> requiredHelpers = new();
                    using var sw = new StringWriter();
                    using var writer = new IndentedTextWriter(sw);
                    writer.Indent += 2;
                    writer.WriteLine();
                    EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers, regexMethod.CompilationData.CheckOverflow);
                    writer.Indent -= 2;
                    return (regexMethod, sw.ToString(), requiredHelpers, regexMethod.CompilationData);
                })

                // Combine all of the generated text outputs into a single batch, then split
                // the source model from diagnostics so they can be emitted independently.
                .Collect()
                .Select(static (results, _) =>
                {
                    ImmutableArray<Diagnostic>.Builder? diagnostics = null;
                    ImmutableArray<object>.Builder? filteredResults = null;

                    foreach (object result in results)
                    {
                        if (result is Diagnostic d)
                        {
                            (diagnostics ??= ImmutableArray.CreateBuilder<Diagnostic>()).Add(d);
                        }
                        else if (result is ValueTuple<RegexMethod, string, Diagnostic, CompilationData> limitedSupportResult)
                        {
                            // A limited-support regex contributes both a diagnostic and a (diagnostic-free) model entry.
                            (diagnostics ??= ImmutableArray.CreateBuilder<Diagnostic>()).Add(limitedSupportResult.Item3);
                            (filteredResults ??= ImmutableArray.CreateBuilder<object>()).Add(
                                (limitedSupportResult.Item1, limitedSupportResult.Item2, limitedSupportResult.Item4));
                        }
                        else
                        {
                            (filteredResults ??= ImmutableArray.CreateBuilder<object>()).Add(result);
                        }
                    }

                    return (
                        Results: filteredResults?.ToImmutable() ?? ImmutableArray<object>.Empty,
                        Diagnostics: diagnostics?.ToImmutable() ?? ImmutableArray<Diagnostic>.Empty);
                });

            // Project to just the source model, discarding diagnostics.
            // ObjectImmutableArraySequenceEqualityComparer applies element-wise equality over
            // the heterogeneous result array, enabling Roslyn's incremental pipeline to skip
            // re-emitting source when the model has not changed.
            IncrementalValueProvider<ImmutableArray<object>> sourceModel =
                collected.Select(static (t, _) => t.Results).WithComparer(new ObjectImmutableArraySequenceEqualityComparer());

            context.RegisterSourceOutput(sourceModel, static (context, results) =>
            {
                if (results.IsEmpty)
                {
                    return;
                }

                // At this point we'll be emitting code.  Create a writer to hold it all.
                using StringWriter sw = new();
                using IndentedTextWriter writer = new(sw);

                // Add file headers.
                foreach (string header in s_headers)
                {
                    writer.WriteLine(header);
                }
                writer.WriteLine();

                // For every generated type, we give it an incrementally increasing ID, in order to create
                // unique type names even in situations where method names were the same, while also keeping
                // the type names short.  Note that this is why we only generate the RunnerFactory implementations
                // earlier in the pipeline... we want to avoid generating code that relies on the class names
                // until we're able to iterate through them linearly keeping track of a deterministic ID
                // used to name them.  The boilerplate code generation that happens here is minimal when compared to
                // the work required to generate the actual matching code for the regex.
                int id = 0;

                // To minimize generated code in the event of duplicated regexes, we only emit one derived Regex type per unique
                // expression/options/timeout.  A Dictionary<(expression, options, timeout), RegexMethod> is used to deduplicate, where the value of the
                // pair is the implementation used for the key.
                // NOTE(review): the key doesn't include the culture that was passed to RegexParser.Parse earlier in the
                // pipeline; confirm that regexes differing only by culture can safely share an implementation.
                var emittedExpressions = new Dictionary<(string Pattern, RegexOptions Options, int? Timeout), RegexMethod>();

                // If we have any (RegexMethod regexMethod, string reason, CompilationData compilationData), these are regexes for which we have
                // limited support and need to simply output boilerplate.
                // If we have any (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers, CompilationData compilationData),
                // those are generated implementations to be emitted.  We need to gather up their required helpers.
                Dictionary<string, string[]> requiredHelpers = new();
                foreach (object? result in results)
                {
                    RegexMethod? regexMethod = null;
                    if (result is ValueTuple<RegexMethod, string, CompilationData> limitedSupportResult)
                    {
                        regexMethod = limitedSupportResult.Item1;
                    }
                    else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
                    {
                        // The first regex to request a given helper wins; later requests for the same key are ignored.
                        foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
                        {
                            if (!requiredHelpers.ContainsKey(helper.Key))
                            {
                                requiredHelpers.Add(helper.Key, helper.Value);
                            }
                        }

                        regexMethod = regexImpl.Item1;
                    }

                    if (regexMethod is not null)
                    {
                        var key = (regexMethod.Pattern, regexMethod.Options, regexMethod.MatchTimeout);
                        if (emittedExpressions.TryGetValue(key, out RegexMethod? implementation))
                        {
                            // Reuse the type generated for the first occurrence of this pattern/options/timeout.
                            regexMethod.IsDuplicate = true;
                            regexMethod.GeneratedName = implementation.GeneratedName;
                        }
                        else
                        {
                            regexMethod.IsDuplicate = false;
                            regexMethod.GeneratedName = $"{regexMethod.MemberName}_{id++}";
                            emittedExpressions.Add(key, regexMethod);
                        }

                        EmitRegexPartialMethod(regexMethod, writer);
                        writer.WriteLine();
                    }
                }

                // At this point we've emitted all the partial method definitions, but we still need to emit the actual regex-derived implementations.
                // These are all emitted inside of our generated class.

                writer.WriteLine($"namespace {GeneratedNamespace}");
                writer.WriteLine($"{{");

                // We emit usings here now that we're inside of a namespace block and are no longer emitting code into
                // a user's partial type.  We can now rely on binding rules mapping to these usings and don't need to
                // use global-qualified names for the rest of the implementation.
                writer.WriteLine($"    using System;");
                writer.WriteLine($"    using System.Buffers;");
                writer.WriteLine($"    using System.CodeDom.Compiler;");
                writer.WriteLine($"    using System.Collections;");
                writer.WriteLine($"    using System.ComponentModel;");
                writer.WriteLine($"    using System.Globalization;");
                writer.WriteLine($"    using System.Runtime.CompilerServices;");
                writer.WriteLine($"    using System.Text.RegularExpressions;");
                writer.WriteLine($"    using System.Threading;");
                writer.WriteLine($"");

                // Emit each Regex-derived type.
                writer.Indent++;
                foreach (object? result in results)
                {
                    if (result is ValueTuple<RegexMethod, string, CompilationData> limitedSupportResult)
                    {
                        if (!limitedSupportResult.Item1.IsDuplicate)
                        {
                            EmitRegexLimitedBoilerplate(writer, limitedSupportResult.Item1, limitedSupportResult.Item2, limitedSupportResult.Item3.LanguageVersion);
                            writer.WriteLine();
                        }
                    }
                    else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
                    {
                        if (!regexImpl.Item1.IsDuplicate)
                        {
                            EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, regexImpl.Item4.AllowUnsafe);
                            writer.WriteLine();
                        }
                    }
                }
                writer.Indent--;

                // If any of the Regex-derived types asked for helper methods, emit those now.
                if (requiredHelpers.Count != 0)
                {
                    writer.Indent++;
                    writer.WriteLine($"/// <summary>Helper methods used by generated <see cref=\"Regex\"/>-derived implementations.</summary>");
                    writer.WriteLine($"[{s_generatedCodeAttribute}]");
                    writer.WriteLine($"file static class {HelpersTypeName}");
                    writer.WriteLine($"{{");
                    writer.Indent++;
                    bool sawFirst = false;

                    // Helpers are emitted in ordinal key order so that the output is deterministic.
                    foreach (KeyValuePair<string, string[]> helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal))
                    {
                        if (sawFirst)
                        {
                            writer.WriteLine();
                        }
                        sawFirst = true;

                        foreach (string value in helper.Value)
                        {
                            writer.WriteLine(value);
                        }
                    }
                    writer.Indent--;
                    writer.WriteLine($"}}");
                    writer.Indent--;
                }

                writer.WriteLine($"}}");

                // Save out the source
                context.AddSource("RegexGenerator.g.cs", sw.ToString());
            });

            // Project to just the diagnostics, discarding the model. ImmutableArray<Diagnostic> does not
            // implement value equality, so Roslyn's incremental pipeline uses reference equality —
            // the callback fires on every compilation change. This is by design: diagnostic emission
            // is cheap, and we need fresh SourceLocation instances that are pragma-suppressible
            // (cf. https://github.com/dotnet/runtime/issues/92509).
            IncrementalValueProvider<ImmutableArray<Diagnostic>> diagnosticResults =
                collected.Select(static (t, _) => t.Diagnostics);

            context.RegisterSourceOutput(diagnosticResults, static (context, diagnostics) =>
            {
                foreach (Diagnostic diagnostic in diagnostics)
                {
                    context.ReportDiagnostic(diagnostic);
                }
            });
        }
 
        /// <summary>Determines whether a full C# implementation can be generated for the specified regex.</summary>
        /// <param name="method">The regex whose parsed tree is inspected.</param>
        /// <param name="languageVersion">The C# language version to validate against the minimum required version.</param>
        /// <param name="reason">
        /// When this method returns <see langword="false"/>, a human-readable explanation of why code generation
        /// isn't supported; <see langword="null"/> when it returns <see langword="true"/>.
        /// </param>
        /// <returns><see langword="true"/> if code generation is supported; otherwise, <see langword="false"/>.</returns>
        /// <remarks>
        /// The reason is emitted by the source generator as a comment into the C# code, hence there's no need
        /// to localize it.
        /// </remarks>
        private static bool SupportsCodeGeneration(RegexMethod method, LanguageVersion languageVersion, [NotNullWhen(false)] out string? reason)
        {
            // Language versions older than C# 11 are rejected outright.
            if (languageVersion < LanguageVersion.CSharp11)
            {
                reason = "the language version must be C# 11 or higher.";
                return false;
            }

            RegexNode root = method.Tree.Root;

            // A pattern that can't be compiled can't be source generated either; the callee supplies the reason.
            bool compilable = root.SupportsCompilation(out reason);
            if (!compilable)
            {
                return false;
            }

            // Case-insensitive patterns rely on the internal Regex case equivalence table for character comparisons.
            // Almost all uses of that table happen at construction time, where characters involved in case conversion
            // are replaced by sets of every character they could match.  Backreferences are the exception: they need
            // case-insensitive comparison at match time.  Since the table is internal and unreachable from the emitted
            // type, a pattern containing a case-insensitive backreference doesn't get a source generated implementation.
            if (ContainsIgnoreCaseBackreference(root))
            {
                reason = "the expression contains case-insensitive backreferences which are not supported by the source generator";
                return false;
            }

            // Compilable and free of case-insensitive backreferences: code generation is supported.
            reason = null;
            return true;

            // Walks the tree rooted at the supplied node looking for a backreference that has IgnoreCase set.
            static bool ContainsIgnoreCaseBackreference(RegexNode start)
            {
                // An explicit stack is used in place of recursion; children are pushed in reverse so that
                // nodes are visited in the same parent-first, left-to-right order a recursive walk would use.
                var pending = new Stack<RegexNode>();
                pending.Push(start);

                while (pending.Count > 0)
                {
                    RegexNode current = pending.Pop();

                    bool ignoresCase = (current.Options & RegexOptions.IgnoreCase) != 0;
                    if (ignoresCase && current.Kind is RegexNodeKind.Backreference)
                    {
                        return true;
                    }

                    for (int i = current.ChildCount() - 1; i >= 0; i--)
                    {
                        pending.Push(current.Child(i));
                    }
                }

                return false;
            }
        }
 
        /// <summary>
        /// Compares two <see cref="ImmutableArray{T}"/> of <see cref="object"/> element by element, using each
        /// element's own <see cref="object.Equals(object)"/> implementation.
        /// </summary>
        /// <remarks>
        /// A default (uninitialized) array is only equal to another default array; neither method throws for it.
        /// </remarks>
        private sealed class ObjectImmutableArraySequenceEqualityComparer : IEqualityComparer<ImmutableArray<object>>
        {
            /// <summary>Determines whether two arrays have the same length and pairwise-equal elements.</summary>
            /// <param name="left">The first array to compare.</param>
            /// <param name="right">The second array to compare.</param>
            /// <returns><see langword="true"/> if both arrays are default, or both are initialized and every pair of elements is equal.</returns>
            public bool Equals(ImmutableArray<object> left, ImmutableArray<object> right)
            {
                // Length and the indexer dereference the backing array, which is null for a default ImmutableArray,
                // so default instances have to be handled before touching either.
                if (left.IsDefault || right.IsDefault)
                {
                    return left.IsDefault == right.IsDefault;
                }

                if (left.Length != right.Length)
                {
                    return false;
                }

                for (int i = 0; i < left.Length; i++)
                {
                    // A null element only matches another null element; otherwise defer to the left element's Equals.
                    bool areEqual = left[i] is { } leftElem
                        ? leftElem.Equals(right[i])
                        : right[i] is null;

                    if (!areEqual)
                    {
                        return false;
                    }
                }

                return true;
            }

            /// <summary>Computes a hash code by folding the hash of each element, in order, into a running value.</summary>
            /// <param name="obj">The array to hash.</param>
            /// <returns>The combined hash code; 0 for a default or empty array.</returns>
            public int GetHashCode([DisallowNull] ImmutableArray<object> obj)
            {
                int hash = 0;

                // A default array has no backing storage to enumerate; treat it as contributing nothing.
                if (obj.IsDefault)
                {
                    return hash;
                }

                for (int i = 0; i < obj.Length; i++)
                {
                    // The tuple's GetHashCode combines the running hash with the element's hash and tolerates null elements.
                    hash = (hash, obj[i]).GetHashCode();
                }

                return hash;
            }
        }
    }
}