File: Grammar\GrammarGenerator.cs
Web Access
Project: src\src\Tools\Source\CompilerGeneratorTools\Source\CSharpSyntaxGenerator\CSharpSyntaxGenerator.csproj (CSharpSyntaxGenerator)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
// We only support grammar generation in the command line version for now which is the netcoreapp target
#if NET
 
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.CodeAnalysis.CSharp;
 
namespace CSharpSyntaxGenerator.Grammar
{
    internal static class GrammarGenerator
    {
        public static string Run(List<TreeType> types)
        {
            var rules = types.ToDictionary(n => n.Name, _ => new List<Production>());
            foreach (var type in types)
            {
                if (type.Base != null && rules.TryGetValue(type.Base, out var productions))
                    productions.Add(RuleReference(type.Name));
 
                if (type is Node && type.Children.Count > 0)
                {
                    // Convert rules like `a: (x | y) ...` into:
                    //
                    // a: x ...
                    //  | y ...;
                    //
                    // Note: if we have `a: (a1 | b1) ... (ax | bx) presume that that's a paired construct and generate:
                    //
                    // a: a1 ... ax
                    //  | b1 ... bx;
 
                    if (type.Children.First() is Field firstField && firstField.Kinds.Count > 0)
                    {
                        var originalFirstFieldKinds = firstField.Kinds.ToList();
                        if (type.Children.Count >= 2 && type.Children.Last() is Field lastField && lastField.Kinds.Count == firstField.Kinds.Count)
                        {
                            var originalLastFieldKinds = lastField.Kinds.ToList();
                            for (int i = 0; i < originalFirstFieldKinds.Count; i++)
                            {
                                firstField.Kinds = [originalFirstFieldKinds[i]];
                                lastField.Kinds = [originalLastFieldKinds[i]];
                                rules[type.Name].Add(Sequence(type.Children.Select(ToProduction)));
                            }
                        }
                        else
                        {
                            for (int i = 0; i < originalFirstFieldKinds.Count; i++)
                            {
                                firstField.Kinds = [originalFirstFieldKinds[i]];
                                rules[type.Name].Add(Sequence(type.Children.Select(ToProduction)));
                            }
                        }
                    }
                    else
                    {
                        rules[type.Name].Add(Sequence(type.Children.Select(ToProduction)));
                    }
                }
            }
 
            // Add some rules not present in Syntax.xml.
            AddLexicalRules(rules);
 
            // The grammar will bottom out with certain lexical productions. Create rules for these.
            var lexicalRules = rules.Values.SelectMany(ps => ps).SelectMany(p => p.ReferencedRules)
                .Where(r => !rules.TryGetValue(r, out var productions) || productions.Count == 0).ToArray();
            foreach (var name in lexicalRules)
                rules[name] = [new("/* see lexical specification */")];
 
            var seen = new HashSet<string>();
 
            // Define a few major sections to help keep the grammar file naturally grouped.
            List<string> majorRules = [
                "CompilationUnitSyntax",
                "MemberDeclarationSyntax",
                "TypeSyntax",
                "StatementSyntax",
                "ExpressionSyntax",
                "XmlNodeSyntax",
                "StructuredTriviaSyntax",
                // Place all syntax tokens at the end to keep them out of the way.
                "SyntaxToken",
                .. rules["SyntaxToken"].SelectMany(r => r.ReferencedRules)];
 
            var result = "// <auto-generated />" + Environment.NewLine + "grammar csharp;" + Environment.NewLine;
 
            // Handle each major section first and then walk any rules not hit transitively from them.
            foreach (var rule in majorRules.Concat(rules.Keys.OrderBy(a => a)))
                processRule(rule, ref result);
 
            return result;
 
            void processRule(string name, ref string result)
            {
                if (name != "CSharpSyntaxNode" && seen.Add(name))
                {
                    // Order the productions to keep us independent from whatever changes happen in Syntax.xml.
                    var sorted = rules[name].OrderBy(v => v);
                    result += Environment.NewLine + RuleReference(name).Text + Environment.NewLine + "  : " +
                        string.Join(Environment.NewLine + "  | ", sorted) + Environment.NewLine + "  ;" + Environment.NewLine;
 
                    // Now proceed in depth-first fashion through the referenced rules to keep related rules
                    // close by. Don't recurse into major-sections to help keep them separated in grammar file.
                    foreach (var production in sorted)
                        foreach (var referencedRule in production.ReferencedRules)
                            if (!majorRules.Concat(lexicalRules).Contains(referencedRule))
                                processRule(referencedRule, ref result);
                }
            }
        }
 
        private static void AddLexicalRules(Dictionary<string, List<Production>> rules)
        {
            addUtf8Rules();
            addTokenRules();
            addIdentifierRules();
            addRealLiteralRules();
            addNumericLiteralRules();
            addIntegerLiteralRules();
            addEscapeSequenceRules();
            addStringLiteralRules();
            addCharacterLiteralRules();
 
            void addUtf8Rules()
            {
                var utf8Suffix = Choice(anyCasing("U8"));
                rules.Add("Utf8StringLiteralToken", [Sequence([RuleReference("StringLiteralToken"), utf8Suffix])]);
                rules.Add("Utf8MultiLineRawStringLiteralToken", [Sequence([RuleReference("MultiLineRawStringLiteralToken"), utf8Suffix])]);
                rules.Add("Utf8SingleLineRawStringLiteralToken", [Sequence([RuleReference("SingleLineRawStringLiteralToken"), utf8Suffix])]);
            }
 
            void addTokenRules()
            {
                rules["SyntaxToken"].AddRange([RuleReference("IdentifierToken"), RuleReference("Keyword"), RuleReference("NumericLiteralToken"), RuleReference("CharacterLiteralToken"), RuleReference("StringLiteralToken"), RuleReference("OperatorToken"), RuleReference("PunctuationToken")]);
 
                var modifierWords = GetMembers<DeclarationModifiers>()
                    .Where(n => GetSyntaxKind(n + "Keyword") != SyntaxKind.None)
                    .Select(n => n.ToString().ToLower());
                rules.Add("Modifier", JoinWords(modifierWords));
 
                var keywords = JoinWords(GetMembers<SyntaxKind>().Where(k => SyntaxFacts.IsReservedKeyword(k)).Select(SyntaxFacts.GetText).Where(t => !modifierWords.Contains(t)));
                keywords.Add(RuleReference("Modifier"));
                rules.Add("Keyword", keywords);
 
                var operatorTokens = GetMembers<SyntaxKind>().Where(m => SyntaxFacts.IsBinaryExpressionOperatorToken(m) || SyntaxFacts.IsPostfixUnaryExpression(m) || SyntaxFacts.IsPrefixUnaryExpression(m) || SyntaxFacts.IsAssignmentExpressionOperatorToken(m));
                rules.Add("OperatorToken", JoinWords(operatorTokens.Select(SyntaxFacts.GetText)));
 
                rules.Add("PunctuationToken", JoinWords(GetMembers<SyntaxKind>()
                    .Where(m => SyntaxFacts.IsLanguagePunctuation(m) && !operatorTokens.Contains(m) && !m.ToString().StartsWith("Xml"))
                    .Select(SyntaxFacts.GetText)));
            }
 
            void addIdentifierRules()
            {
                rules.Add("IdentifierToken", [Sequence([Text("@").Optional, RuleReference("IdentifierStartCharacter"), RuleReference("IdentifierPartCharacter")])]);
                rules.Add("IdentifierStartCharacter", [RuleReference("LetterCharacter"), RuleReference("UnderscoreCharacter")]);
                rules.Add("IdentifierPartCharacter", [RuleReference("LetterCharacter"), RuleReference("DecimalDigitCharacter"), RuleReference("ConnectingCharacter"), RuleReference("CombiningCharacter"), RuleReference("FormattingCharacter")]);
                rules.Add("UnderscoreCharacter", [Text("_"), new("""'\\u005' /* unicode_escape_sequence for underscore */""")]);
                rules.Add("LetterCharacter", [
                    new("""/* [\p{L}\p{Nl}] category letter, all subcategories; category number, subcategory letter */"""),
                    new("unicode_escape_sequence /* only escapes for categories L & Nl allowed */")]);
 
                rules.Add("CombiningCharacter", [
                    new("""/* [\p{Mn}\p{Mc}] category Mark, subcategories non-spacing and spacing combining */"""),
                    new("unicode_escape_sequence /* only escapes for categories Mn & Mc allowed */")]);
 
                rules.Add("DecimalDigitCharacter", [
                    new("""/* [\p{Nd}] category number, subcategory decimal digit */"""),
                    new("unicode_escape_sequence /* only escapes for category Nd allowed */")]);
 
                rules.Add("ConnectingCharacter", [
                    new("""/* [\p{Pc}] category Punctuation, subcategory connector */"""),
                    new("unicode_escape_sequence /* only escapes for category Pc allowed */")]);
 
                rules.Add("FormattingCharacter", [
                    new("""/* [\p{Cf}] category Other, subcategory format. */"""),
                    new("unicode_escape_sequence /* only escapes for category Cf allowed */")]);
            }
 
            void addRealLiteralRules()
            {
                var decimalDigitPlus = RuleReference("DecimalDigit").OneOrMany;
                var exponentPart = RuleReference("ExponentPart");
                var exponentPartOpt = exponentPart.Optional;
                var realTypeSuffix = RuleReference("RealTypeSuffix");
                var realTypeSuffixOpt = realTypeSuffix.Optional;
 
                rules.Add("RealLiteralToken", [
                    Sequence([decimalDigitPlus, Text("."), decimalDigitPlus, exponentPartOpt, realTypeSuffixOpt]),
                    Sequence([Text("."), decimalDigitPlus, exponentPartOpt, realTypeSuffixOpt]),
                    Sequence([decimalDigitPlus, exponentPart, realTypeSuffixOpt]),
                    Sequence([decimalDigitPlus, realTypeSuffix]),
                ]);
 
                rules.Add("ExponentPart", [Sequence([Choice(anyCasing("E")), Choice([Text("+"), Text("-")]).Optional, decimalDigitPlus])]);
                rules.Add("RealTypeSuffix", [.. anyCasing("F"), .. anyCasing("D"), .. anyCasing("M")]);
            }
 
            void addNumericLiteralRules()
            {
                rules.Add("NumericLiteralToken", [RuleReference("IntegerLiteralToken"), RuleReference("RealLiteralToken")]);
            }
 
            void addIntegerLiteralRules()
            {
                var decimalDigit = RuleReference("DecimalDigit");
                var decimalDigitPlus = decimalDigit.OneOrMany;
                var integerTypeSuffixOpt = RuleReference("IntegerTypeSuffix").Optional;
 
                rules.Add("IntegerLiteralToken", [RuleReference("DecimalIntegerLiteralToken"), RuleReference("HexadecimalIntegerLiteralToken")]);
                rules.Add("DecimalIntegerLiteralToken", [Sequence([decimalDigitPlus, integerTypeSuffixOpt])]);
                rules.Add("IntegerTypeSuffix", [.. anyCasing("U"), .. anyCasing("L"), .. anyCasing("UL"), .. anyCasing("LU")]);
                rules.Add("DecimalDigit", [.. productionRange('0', '9')]);
                rules.Add("HexadecimalDigit", [decimalDigit, .. productionRange('A', 'F'), .. productionRange('a', 'f')]);
                rules.Add("HexadecimalIntegerLiteralToken", [Sequence([Choice([Text("0x"), Text("0X")]), RuleReference("HexadecimalDigit").OneOrMany, integerTypeSuffixOpt])]);
            }
 
            void addEscapeSequenceRules()
            {
                var hexDigit = RuleReference("HexadecimalDigit");
                var hexDigitOpt = hexDigit.Optional;
 
                rules.Add("SimpleEscapeSequence", [Text(@"\'"), Text(@"\"""), Text(@"\\"), Text(@"\0"), Text(@"\a"), Text(@"\b"), Text(@"\f"), Text(@"\n"), Text(@"\r"), Text(@"\t"), Text(@"\v")]);
                rules.Add("HexadecimalEscapeSequence", [Sequence([Text(@"\x"), hexDigit, .. repeat(hexDigitOpt, 3)])]);
                rules.Add("UnicodeEscapeSequence", [Sequence([Text(@"\u"), .. repeat(hexDigit, 4)]), Sequence([Text(@"\U"), .. repeat(hexDigit, 8)])]);
            }
 
            void addStringLiteralRules()
            {
                rules.Add("StringLiteralToken", [RuleReference("RegularStringLiteralToken"), RuleReference("VerbatimStringLiteralToken")]);
 
                rules.Add("RegularStringLiteralToken", [Sequence([Text("\""), RuleReference("RegularStringLiteralCharacter").ZeroOrMany, Text("\"")])]);
                rules.Add("RegularStringLiteralCharacter", [RuleReference("SingleRegularStringLiteralCharacter"), RuleReference("SimpleEscapeSequence"), RuleReference("HexadecimalEscapeSequence"), RuleReference("UnicodeEscapeSequence")]);
                rules.Add("SingleRegularStringLiteralCharacter", [new("""/* ~["\\\u000D\u000A\u0085\u2028\u2029] anything but ", \, and new_line_character */""")]);
 
                rules.Add("VerbatimStringLiteralToken", [Sequence([Text("@\""), RuleReference("VerbatimStringLiteralCharacter").ZeroOrMany, Text("\"")])]);
                rules.Add("VerbatimStringLiteralCharacter", [RuleReference("SingleVerbatimStringLiteralCharacter"), RuleReference("QuoteEscapeSequence")]);
                rules.Add("SingleVerbatimStringLiteralCharacter", [new("/* anything but quotation mark (U+0022) */")]);
 
                rules.Add("QuoteEscapeSequence", [Text("\"\"")]);
 
                rules.Add("InterpolatedMultiLineRawStringStartToken", [new(""""'$'+ '"""' '"'*"""")]);
                rules.Add("InterpolatedRawStringEndToken", [new(""""'"""' '"'* /* must match number of quotes in raw_string_start_token */"""")]);
                rules.Add("InterpolatedSingleLineRawStringStartToken", [new(""""'$'+ '"""' '"'*"""")]);
            }
 
            void addCharacterLiteralRules()
            {
                rules.Add("CharacterLiteralToken", [Sequence([Text("'"), RuleReference("Character"), Text("'")])]);
                rules.Add("Character", [RuleReference("SingleCharacter"), RuleReference("SimpleEscapeSequence"), RuleReference("HexadecimalEscapeSequence"), RuleReference("UnicodeEscapeSequence")]);
                rules.Add("SingleCharacter", [new("""/* ~['\\\u000D\u000A\u0085\u2028\u2029] anything but ', \\, and new_line_character */""")]);
            }
 
            IEnumerable<Production> productionRange(char start, char end)
            {
                for (char c = start; c <= end; c++)
                    yield return Text($"{c}");
            }
 
            IEnumerable<Production> repeat(Production production, int count)
                => Enumerable.Repeat(production, count);
 
            IEnumerable<Production> anyCasing(string value)
            {
                var array = value.Select(c => char.IsLetter(c) ? [char.ToUpperInvariant(c), char.ToLowerInvariant(c)] : new[] { c }).ToArray();
 
                var indices = new int[array.Length];
                var builder = new StringBuilder();
                do
                {
                    for (var i = 0; i < value.Length; i++)
                        builder.Append(array[i][indices[i]]);
 
                    yield return Text($"{builder}");
                    builder.Clear();
 
                    for (var i = 0; i < indices.Length; i++)
                    {
                        if (++indices[i] < array[i].Length)
                            break;
 
                        indices[i] = 0;
                    }
                }
                while (!indices.All(i => i == 0));
            }
        }
 
        private static List<Production> JoinWords(IEnumerable<string> strings)
            => strings.Select(s => new Production($"""'{Escape(s)}'""")).ToList();
 
        private static string Escape(string s)
            => s.Replace(@"\", @"\\").Replace("'", @"\'");
 
        private static Production Text(string value)
            => new($"'{Escape(value)}'");
 
        private static Production Join(IEnumerable<Production> productions, string delim)
            => new(string.Join(delim, productions.Where(p => p.Text.Length > 0)), productions.SelectMany(p => p.ReferencedRules));
 
        private static Production ToProduction(TreeTypeChild child)
            => child switch
            {
                Choice c => Choice(c.Children.Select(ToProduction)).Suffix("?", when: c.Optional),
                Sequence s => Sequence(s.Children.Select(ToProduction)).Parenthesize(),
                Field f => HandleField(f).Suffix("?", when: f.IsOptional),
                _ => throw new InvalidOperationException(),
            };
 
        private static Production Choice(IEnumerable<Production> productions, bool parenthesize = true)
            => Join(productions, " | ").Parenthesize(parenthesize);
 
        private static Production Sequence(IEnumerable<Production> productions)
            => Join(productions, " ");
 
        private static Production HandleField(Field field)
            // 'bool' fields are for a few properties we generate on DirectiveTrivia. They're not
            // relevant to the grammar, so we just return an empty production to ignore them.
            => field.Type == "bool" ? new Production("") :
               field.Type == "CSharpSyntaxNode" ? RuleReference(field.Kinds.Single().Name + "Syntax") :
               field.Type.StartsWith("SeparatedSyntaxList") ? HandleSeparatedList(field, field.Type[("SeparatedSyntaxList".Length + 1)..^1]) :
               field.Type.StartsWith("SyntaxList") ? HandleList(field, field.Type[("SyntaxList".Length + 1)..^1]) :
               field.IsToken ? HandleTokenField(field) : RuleReference(field.Type);
 
        private static Production HandleSeparatedList(Field field, string elementType)
            => RuleReference(elementType).Suffix(" (',' " + RuleReference(elementType) + ")")
                .Suffix("*", when: field.MinCount < 2).Suffix("+", when: field.MinCount >= 2)
                .Suffix(" ','?", when: field.AllowTrailingSeparator)
                .Parenthesize(when: field.MinCount == 0).Suffix("?", when: field.MinCount == 0);
 
        private static Production HandleList(Field field, string elementType)
            => (elementType != "SyntaxToken" ? RuleReference(elementType) :
                field.Name == "Commas" ? Text(",") :
                field.Name == "Modifiers" ? RuleReference("Modifier") :
                field.Name == "TextTokens" ? RuleReference(nameof(SyntaxKind.XmlTextLiteralToken)) : RuleReference(elementType))
                    .Suffix(field.MinCount == 0 ? "*" : "+");
 
        private static Production HandleTokenField(Field field)
            => field.Kinds.Count == 0
                ? HandleTokenName(field.Name)
                : Choice(field.Kinds.Select(k => HandleTokenName(k.Name)), parenthesize: field.Kinds.Count >= 2);
 
        private static Production HandleTokenName(string tokenName)
            => GetSyntaxKind(tokenName) is var kind && kind == SyntaxKind.None ? RuleReference("SyntaxToken") :
               SyntaxFacts.GetText(kind) is var text && text != "" ? Text(text) :
               tokenName.StartsWith("EndOf") ? new Production("") :
               tokenName.StartsWith("Omitted") ? new Production("/* epsilon */") : RuleReference(tokenName);
 
        private static SyntaxKind GetSyntaxKind(string name)
            => GetMembers<SyntaxKind>().Where(k => k.ToString() == name).SingleOrDefault();
 
        private static IEnumerable<TEnum> GetMembers<TEnum>() where TEnum : struct, Enum
            => (IEnumerable<TEnum>)Enum.GetValues(typeof(TEnum));
 
        private static Production RuleReference(string name)
            => new(
                s_normalizationRegex.Replace(name.EndsWith("Syntax") ? name[..^"Syntax".Length] : name, "_").ToLower(),
                ImmutableArray.Create(name));
 
        // Converts a PascalCased name into snake_cased name.
        private static readonly Regex s_normalizationRegex = new(
            "(?<=[A-Z])(?=[A-Z][a-z0-9]) | (?<=[^A-Z])(?=[A-Z]) | (?<=[A-Za-z0-9])(?=[^A-Za-z0-9])",
            RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
    }
 
    internal readonly struct Production(
        string text, IEnumerable<string> referencedRules = null) : IComparable<Production>
    {
        public readonly string Text = text;
        public readonly ImmutableArray<string> ReferencedRules = referencedRules?.ToImmutableArray() ?? ImmutableArray<string>.Empty;
 
        public override string ToString() => Text;
        public int CompareTo(Production other) => StringComparer.OrdinalIgnoreCase.Compare(this.Text, other.Text);
        public Production Prefix(string prefix) => new(prefix + this, ReferencedRules);
        public Production Suffix(string suffix, bool when = true) => when ? new(this + suffix, ReferencedRules) : this;
        public Production Parenthesize(bool when = true) => when ? Prefix("(").Suffix(")") : this;
        public Production Optional => Suffix("?");
        public Production ZeroOrMany => Suffix("*");
        public Production OneOrMany => Suffix("+");
    }
}
 
namespace Microsoft.CodeAnalysis
{
    internal static class GreenNode
    {
        internal const int ListKind = 1; // See SyntaxKind.
    }
}
 
#endif