// RoslynCSharpTokenizer.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using Microsoft.AspNetCore.Razor.PooledObjects;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.CodeAnalysis.Text;

using SyntaxToken = Microsoft.AspNetCore.Razor.Language.Syntax.InternalSyntax.SyntaxToken;
using SyntaxFactory = Microsoft.AspNetCore.Razor.Language.Syntax.InternalSyntax.SyntaxFactory;
using CSharpSyntaxKind = Microsoft.CodeAnalysis.CSharp.SyntaxKind;
using CSharpSyntaxToken = Microsoft.CodeAnalysis.SyntaxToken;
using CSharpSyntaxTriviaList = Microsoft.CodeAnalysis.SyntaxTriviaList;

namespace Microsoft.AspNetCore.Razor.Language.Legacy;

#pragma warning disable RSEXPERIMENTAL003 // SyntaxTokenParser is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
internal sealed class RoslynCSharpTokenizer : CSharpTokenizer
{
    /// <summary>
    /// The Roslyn token parser that performs the underlying C# lexing. It is created in the constructor from the
    /// source text of the reader this tokenizer was constructed with.
    /// </summary>
    private readonly SyntaxTokenParser _roslynTokenParser;
    /// <summary>
    /// The current trivia enumerator that we're parsing through. This is a tuple of the enumerator and whether it's leading trivia.
    /// When this is non-null, we're in the <see cref="RoslynCSharpTokenizerState.TriviaForCSharpToken"/> state.
    /// </summary>
    private (CSharpSyntaxTriviaList.Enumerator enumerator, bool isLeading)? _currentCSharpTokenTriviaEnumerator;
    /// <summary>
    /// Previous result checkpoints that we can reset <see cref="_roslynTokenParser"/> to. This must be an ordered list
    /// by position, where the position is the start of the token that was parsed including leading trivia, so that searching
    /// is correct when performing a reset.
    /// </summary>
    private readonly List<(int position, SyntaxTokenParser.Result result, bool isOnlyWhitespaceOnLine)> _resultCache = ListPool<(int, SyntaxTokenParser.Result, bool)>.Default.Get();

    /// <summary>
    /// Whether only whitespace has been seen so far on the current line. Preprocessor directives that are encountered
    /// while this is <see langword="false"/> are reported as not being at the start of a line. The value is stored
    /// alongside each entry in <see cref="_resultCache"/> so that it can be restored on reset.
    /// </summary>
    private bool _isOnlyWhitespaceOnLine = true;

    /// <summary>
    /// Creates a tokenizer over <paramref name="source"/> that delegates C# lexing to a Roslyn token parser.
    /// </summary>
    /// <param name="source">The reader supplying the text to tokenize.</param>
    /// <param name="parseOptions">The C# parse options handed to the Roslyn token parser.</param>
    public RoslynCSharpTokenizer(SeekableTextReader source, CSharpParseOptions parseOptions)
        : base(source)
    {
        var sourceText = source.SourceText;
        _roslynTokenParser = CodeAnalysis.CSharp.SyntaxFactory.CreateTokenParser(sourceText, parseOptions);

        base.CurrentState = StartState;
    }

    /// <summary>
    /// The state the tokenizer begins in, as the integer value of <see cref="RoslynCSharpTokenizerState.Start"/>.
    /// </summary>
    protected override int StartState
    {
        get { return (int)RoslynCSharpTokenizerState.Start; }
    }

    /// <summary>
    /// Strongly-typed view over the base tokenizer's integer state; reads and writes are forwarded to the base property.
    /// </summary>
    private new RoslynCSharpTokenizerState? CurrentState
    {
        get
        {
            return (RoslynCSharpTokenizerState?)base.CurrentState;
        }
        set
        {
            base.CurrentState = (int?)value;
        }
    }

    /// <summary>The token kind used for the body of a Razor comment.</summary>
    public override SyntaxKind RazorCommentKind
    {
        get { return SyntaxKind.RazorCommentLiteral; }
    }

    /// <summary>The token kind used for the <c>@</c> that opens or closes a Razor comment.</summary>
    public override SyntaxKind RazorCommentTransitionKind
    {
        get { return SyntaxKind.RazorCommentTransition; }
    }

    /// <summary>The token kind used for the <c>*</c> that opens or closes a Razor comment.</summary>
    public override SyntaxKind RazorCommentStarKind
    {
        get { return SyntaxKind.RazorCommentStar; }
    }

    /// <summary>
    /// Brings the Roslyn token parser up to the current source position and recomputes the
    /// only-whitespace-on-line flag for that position.
    /// </summary>
    internal override void StartingBlock()
    {
        var blockStart = Source.Position;
        _roslynTokenParser.SkipForwardTo(blockStart);

        ResetIsOnlyWhitespaceOnLine();
    }

    /// <summary>
    /// Recomputes <see cref="_isOnlyWhitespaceOnLine"/> by scanning backwards from the current source position until
    /// either a line terminator, a non-whitespace character, or the start of the text is reached.
    /// </summary>
    private void ResetIsOnlyWhitespaceOnLine()
    {
        // Assume the line is blank until a non-whitespace character proves otherwise.
        _isOnlyWhitespaceOnLine = true;

        var text = Source.SourceText;
        for (var i = Source.Position - 1; i >= 0; i--)
        {
            var currentChar = text[i];

            // Use the same definition of a line terminator as the C# lexer (which also recognizes U+0085, U+2028 and
            // U+2029), so that this recomputed value agrees with the value tracked while lexing end-of-line trivia.
            if (SyntaxFacts.IsNewLine(currentChar))
            {
                break;
            }

            if (!SyntaxFacts.IsWhitespace(currentChar))
            {
                _isOnlyWhitespaceOnLine = false;
                break;
            }
        }
    }

    /// <summary>
    /// Prepares the tokenizer for control passing to another parser. If a token was already lexed, the Roslyn parser
    /// is rewound to the last checkpoint, moved to the end of that token (excluding trailing trivia), and the state is
    /// set back to <see cref="RoslynCSharpTokenizerState.Start"/>.
    /// </summary>
    internal override void EndingBlock()
    {
        // We only ever hand over to the other parser in response to content. Either the C# parser put back the token it
        // saw (we are in the Start state and have nothing to do), or a token was parsed and we are in the
        // TriviaForCSharpToken state. In the latter case the other parser will deal with whatever follows the token, so
        // we rewind to before the token, skip just its content, and go back to Start for the next time we are called.
        if (CurrentState == RoslynCSharpTokenizerState.Start)
        {
            return;
        }

        Debug.Assert(CurrentState == RoslynCSharpTokenizerState.TriviaForCSharpToken, $"Unexpected state: {CurrentState}");
        Debug.Assert(_currentCSharpTokenTriviaEnumerator is (_, isLeading: false));
        Debug.Assert(_resultCache.Count > 0);

        var lastIndex = _resultCache.Count - 1;
        var checkpoint = _resultCache[lastIndex];

        _roslynTokenParser.ResetTo(checkpoint.result);
        _isOnlyWhitespaceOnLine = checkpoint.isOnlyWhitespaceOnLine;
        _resultCache.RemoveAt(lastIndex);

        var tokenBeforeHandover = checkpoint.result.Token;
        if (tokenBeforeHandover.HasLeadingTrivia)
        {
            // Re-parse the leading trivia so that any preprocessor directives in it are observed by the Roslyn token
            // parser before we skip ahead.
            _ = GetNextResult(NextResultType.LeadingTrivia);
        }

        _roslynTokenParser.SkipForwardTo(tokenBeforeHandover.Span.End);
        CurrentState = RoslynCSharpTokenizerState.Start;
    }

    /// <summary>
    /// Runs the handler for the current tokenizer state and returns its result.
    /// </summary>
    protected override StateResult Dispatch()
    {
        return CurrentState switch
        {
            RoslynCSharpTokenizerState.Start => Start(),
            RoslynCSharpTokenizerState.TriviaForCSharpToken => Trivia(),
            RoslynCSharpTokenizerState.Token => Token(),
            RoslynCSharpTokenizerState.OnRazorCommentStar => OnRazorCommentStar(),
            RoslynCSharpTokenizerState.AfterRazorCommentTransition => AfterRazorCommentTransition(),
            RoslynCSharpTokenizerState.RazorCommentBody => RazorCommentBody(),
            RoslynCSharpTokenizerState.StarAfterRazorCommentBody => StarAfterRazorCommentBody(),
            RoslynCSharpTokenizerState.AtTokenAfterRazorCommentBody => closeRazorComment(),
            _ => invalidState(),
        };

        // The closing `@` of a Razor comment counts as content on the line; afterwards we resume the trivia list that
        // contained the comment.
        StateResult closeRazorComment()
        {
            Debug.Assert(_currentCSharpTokenTriviaEnumerator is not null);
            _isOnlyWhitespaceOnLine = false;
            return AtTokenAfterRazorCommentBody(nextState: (int)RoslynCSharpTokenizerState.TriviaForCSharpToken);
        }

        StateResult invalidState()
        {
            Debug.Fail("Invalid TokenizerState");
            return default(StateResult);
        }
    }

    /// <summary>
    /// Returns the content for a token of the given kind, using constant strings for the most frequent one- and
    /// two-character tokens to avoid allocating, and deferring to the base implementation otherwise.
    /// </summary>
    protected override string GetTokenContent(SyntaxKind type)
    {
        Debug.Assert(type != SyntaxKind.CSharpOperator, "CSharpOperator should be handled by getting the interned text from C#");

        switch (Buffer.Length)
        {
            case 1:
            {
                var onlyChar = Buffer[0];
                switch (type)
                {
                    case SyntaxKind.NewLine when onlyChar == '\n':
                        return "\n";
                    case SyntaxKind.Whitespace when onlyChar == ' ':
                        return " ";
                    case SyntaxKind.Whitespace when onlyChar == '\t':
                        return "\t";
                    case SyntaxKind.Transition:
                        return "@";
                    case SyntaxKind.NumericLiteral:
                        Debug.Fail("This should be handled by using the C# lexer's interned string in NumericLiteral()");
                        break;
                    case SyntaxKind.Not:
                    case SyntaxKind.LeftParenthesis:
                    case SyntaxKind.RightParenthesis:
                    case SyntaxKind.Comma:
                    case SyntaxKind.Dot:
                    case SyntaxKind.Colon:
                    case SyntaxKind.Semicolon:
                    case SyntaxKind.QuestionMark:
                    case SyntaxKind.RightBracket:
                    case SyntaxKind.LeftBracket:
                    case SyntaxKind.LeftBrace:
                    case SyntaxKind.RightBrace:
                    case SyntaxKind.LessThan:
                    case SyntaxKind.Assign:
                    case SyntaxKind.GreaterThan:
                        Debug.Fail("This should be handled by using the C# lexer's interned string in Operator()");
                        break;
                }

                break;
            }

            case 2:
            {
                switch (type)
                {
                    case SyntaxKind.NewLine:
                        return "\r\n";
                    case SyntaxKind.DoubleColon:
                    case SyntaxKind.Equals:
                        Debug.Fail("This should be handled by using the C# lexer's interned string in Operator()");
                        break;
                }

                break;
            }
        }

        // Nothing matched a cached constant: realize the content the regular way.
        return base.GetTokenContent(type);
    }

    /// <summary>
    /// Creates a Razor syntax token of the given kind with the given content and diagnostics.
    /// </summary>
    protected override SyntaxToken CreateToken(string content, SyntaxKind kind, RazorDiagnostic[] errors)
        => SyntaxFactory.Token(kind, content, errors);

    /// <summary>
    /// Parses the leading trivia at the current position. If there is any, moves to the trivia state to emit it;
    /// otherwise moves straight to the token state. No token is produced by this state itself.
    /// </summary>
    private StateResult Start()
    {
        var leading = GetNextResult(NextResultType.LeadingTrivia);
        var placeholder = leading.Token;
        Debug.Assert(leading.ContextualKind == CSharpSyntaxKind.None);
        Debug.Assert(placeholder.IsKind(CSharpSyntaxKind.None));

        if (!placeholder.HasLeadingTrivia)
        {
            return Transition(RoslynCSharpTokenizerState.Token, null);
        }

        _currentCSharpTokenTriviaEnumerator = (placeholder.LeadingTrivia.GetEnumerator(), isLeading: true);
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, null);
    }

    /// <summary>
    /// Chooses the lexing routine for the token that starts at the current character. Whitespace, newlines and
    /// comment starts are not expected here, as they are consumed as trivia beforehand.
    /// </summary>
    private StateResult Token()
    {
        var current = CurrentCharacter;

        if (SyntaxFacts.IsNewLine(current) || SyntaxFacts.IsWhitespace(current))
        {
            Assumed.Unreachable();
        }

        if (SyntaxFacts.IsIdentifierStartCharacter(current))
        {
            return Identifier();
        }

        if (char.IsDigit(current))
        {
            return NumericLiteral();
        }

        return current switch
        {
            '@' => AtToken(),
            '\'' => TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.Character),
            '"' => TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.String_Or_Raw_String),
            '$' when Peek() is '"' or '$' => TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.Interpolated_Or_Raw_Interpolated_String),
            '$' when Peek() == '@' && Peek(2) == '"' => TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.Verbatim_Interpolated_Dollar_First_String),
            // A dot directly followed by a digit begins a numeric literal such as `.5`.
            '.' when char.IsDigit(Peek()) => NumericLiteral(),
            '/' when Peek() is '/' or '*' => Assumed.Unreachable<StateResult>(),
            _ => Operator(),
        };
    }

    /// <summary>
    /// Handles a token starting with <c>@</c>: a verbatim (optionally interpolated) string, an escaped Razor
    /// transition (<c>@@</c>), or a plain transition that may precede a C# identifier.
    /// </summary>
    private StateResult AtToken()
    {
        AssertCurrent('@');
        var next = Peek();

        if (next == '"')
        {
            return TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.Verbatim_String);
        }

        if (next == '$' && Peek(2) is '"')
        {
            return TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind.Verbatim_Interpolated_At_First_String);
        }

        if (next == '*')
        {
            return Assumed.Unreachable<StateResult>();
        }

        // Everything else emits the `@` as a transition token. First record a reset point just before the `@`: parsing
        // leading trivia gives us one cheaply, and there cannot be any leading trivia since we are sitting on an `@`.
        var resetPoint = GetNextResult(NextResultType.LeadingTrivia);
        Debug.Assert(resetPoint.Token.IsKind(CSharpSyntaxKind.None));
        Debug.Assert(resetPoint.Token.FullSpan.Length == 0);

        TakeCurrent();
        _isOnlyWhitespaceOnLine = false;
        _roslynTokenParser.SkipForwardTo(Source.Position);

        if (next == '@')
        {
            // Escaped razor transition. Likely will error in the parser.
            AssertCurrent('@');
            return Transition(RoslynCSharpTokenizerState.Token, EndToken(SyntaxKind.Transition));
        }

        // Either a standard razor transition or a C# identifier. The parser takes care of stitching the transition and
        // the identifier together in the latter case.
        var afterTransition = GetNextResult(NextResultType.TrailingTrivia);
        _currentCSharpTokenTriviaEnumerator = (afterTransition.Token.TrailingTrivia.GetEnumerator(), isLeading: false);
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, EndToken(SyntaxKind.Transition));
    }

    /// <summary>
    /// Lexes the next token with Roslyn, maps it to the corresponding Razor punctuation/operator kind (or
    /// <see cref="SyntaxKind.Marker"/> when there is no mapping), and queues its trailing trivia.
    /// </summary>
    private StateResult Operator()
    {
        var operatorToken = GetNextResult(NextResultType.Token).Token;
        AdvancePastToken(operatorToken);

        var razorKind = mapKind(operatorToken.RawKind);

        // Prefer the string the C# lexer already interned over realizing the buffer, so that an operator token does
        // not cost a fresh string allocation each time.
        var text = razorKind == SyntaxKind.Marker ? Buffer.ToString() : operatorToken.ValueText;
        Debug.Assert(text == Buffer.ToString());
        Buffer.Clear();

        _currentCSharpTokenTriviaEnumerator = (operatorToken.TrailingTrivia.GetEnumerator(), isLeading: false);
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, EndToken(text, razorKind));

        static SyntaxKind mapKind(int rawKind) => rawKind switch
        {
            (int)CSharpSyntaxKind.ExclamationToken => SyntaxKind.Not,
            (int)CSharpSyntaxKind.OpenParenToken => SyntaxKind.LeftParenthesis,
            (int)CSharpSyntaxKind.CloseParenToken => SyntaxKind.RightParenthesis,
            (int)CSharpSyntaxKind.CommaToken => SyntaxKind.Comma,
            (int)CSharpSyntaxKind.DotToken => SyntaxKind.Dot,
            (int)CSharpSyntaxKind.ColonColonToken => SyntaxKind.DoubleColon,
            (int)CSharpSyntaxKind.ColonToken => SyntaxKind.Colon,
            (int)CSharpSyntaxKind.OpenBraceToken => SyntaxKind.LeftBrace,
            (int)CSharpSyntaxKind.CloseBraceToken => SyntaxKind.RightBrace,
            (int)CSharpSyntaxKind.LessThanToken => SyntaxKind.LessThan,
            (int)CSharpSyntaxKind.GreaterThanToken => SyntaxKind.GreaterThan,
            (int)CSharpSyntaxKind.EqualsToken => SyntaxKind.Assign,
            (int)CSharpSyntaxKind.OpenBracketToken => SyntaxKind.LeftBracket,
            (int)CSharpSyntaxKind.CloseBracketToken => SyntaxKind.RightBracket,
            (int)CSharpSyntaxKind.QuestionToken => SyntaxKind.QuestionMark,
            (int)CSharpSyntaxKind.SemicolonToken => SyntaxKind.Semicolon,
            // Any remaining punctuation in Roslyn's operator range is surfaced as a generic C# operator.
            >= (int)CSharpSyntaxKind.TildeToken and <= (int)CSharpSyntaxKind.GreaterThanGreaterThanGreaterThanEqualsToken => SyntaxKind.CSharpOperator,
            _ => SyntaxKind.Marker,
        };
    }

    /// <summary>
    /// Lexes a string or character literal with Roslyn, consumes the literal's characters from the source, and adds an
    /// unterminated-string diagnostic when the literal consists only of its opening delimiter or is not properly
    /// terminated. The trailing trivia of the literal is queued for the trivia state.
    /// </summary>
    /// <param name="expectedStringKind">The kind of literal the caller determined from the leading characters.</param>
    /// <returns>
    /// A transition to <see cref="RoslynCSharpTokenizerState.TriviaForCSharpToken"/> carrying a
    /// <see cref="SyntaxKind.CharacterLiteral"/> or <see cref="SyntaxKind.StringLiteral"/> token.
    /// </returns>
    /// <exception cref="InvalidOperationException"><paramref name="expectedStringKind"/> is not a known kind.</exception>
    private StateResult TokenizedExpectedStringOrCharacterLiteral(StringOrCharacterKind expectedStringKind)
    {
        var result = GetNextResult(NextResultType.Token);
        var csharpToken = result.Token;
        // For kinds whose delimiters are fixed, the prefix/postfix are known up front. For kinds that may turn out to be
        // raw strings, the delimiters are discovered character by character below (lookForPrePostFix == true), except
        // for the empty literals `""` and `$""`, which are special-cased so they aren't mistaken for raw string openers.
        (string expectedPrefix, string expectedPostfix, bool lookForPrePostFix) = expectedStringKind switch
        {
            StringOrCharacterKind.Character => ("'", "'", false),
            StringOrCharacterKind.Verbatim_String => ("@\"", "\"", false),
            StringOrCharacterKind.Verbatim_Interpolated_At_First_String => ("@$\"", "\"", false),
            StringOrCharacterKind.Verbatim_Interpolated_Dollar_First_String => ("$@\"", "\"", false),
            StringOrCharacterKind.String_Or_Raw_String when csharpToken.Text is "\"\"" => ("\"", "\"", false),
            StringOrCharacterKind.Interpolated_Or_Raw_Interpolated_String when csharpToken.Text is "$\"\"" => ("$\"", "\"", false),
            StringOrCharacterKind.String_Or_Raw_String or StringOrCharacterKind.Interpolated_Or_Raw_Interpolated_String => ("", "", true),
            _ => throw new InvalidOperationException($"Unexpected expectedStringKind: {expectedStringKind}."),
        };

        // Consume exactly as many characters as the Roslyn token spans (excluding trivia), accumulating the
        // prefix/postfix while we are still inside the opening delimiter.
        for (var finalPosition = Source.Position + csharpToken.Span.Length; Source.Position < finalPosition;)
        {
            if (lookForPrePostFix)
            {
                lookForPrePostFix = handleCurrentCharacterPrefixPostfix();
            }

            TakeCurrent();
        }

        // If the token is the expected kind and is only the expected prefix or doesn't have the expected postfix, then it's unterminated.
        // This is a case like `"test` (which doesn't end in the expected postfix), or `"` (which ends in the expected postfix, but
        // exactly matches the expected prefix).
        // Note: UTF-8 string literals can have a u8 or U8 suffix after the closing quote, e.g., "hello"u8
        if (lookForPrePostFix || csharpToken.Text == expectedPrefix || !IsStringProperlyTerminated(csharpToken.Text, expectedPostfix))
        {
            CurrentErrors.Add(
                RazorDiagnosticFactory.CreateParsing_UnterminatedStringLiteral(
                    new SourceSpan(CurrentStart, contentLength: expectedPrefix?.Length ?? 0 /* " */)));
        }

        _currentCSharpTokenTriviaEnumerator = (csharpToken.TrailingTrivia.GetEnumerator(), isLeading: false);
        var razorTokenKind = expectedStringKind == StringOrCharacterKind.Character ? SyntaxKind.CharacterLiteral : SyntaxKind.StringLiteral;
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, EndToken(razorTokenKind));

        // Examines the current character while still inside the opening delimiter, extending expectedPrefix and
        // expectedPostfix as needed. Returns true while the delimiter may continue, and false once it has ended.
        bool handleCurrentCharacterPrefixPostfix()
        {
            switch (expectedStringKind)
            {
                case StringOrCharacterKind.String_Or_Raw_String:
                    // We can either have a normal string or a raw string. Add to the prefix/postfix until we find a non-" character
                    if (CurrentCharacter != '"')
                    {
                        Debug.Assert(expectedPrefix != null);
                        Debug.Assert(expectedPostfix != null);
                        Debug.Assert(expectedPrefix == expectedPostfix);
                        return false;
                    }

                    expectedPrefix += '"';
                    expectedPostfix += '"';
                    return true;

                case StringOrCharacterKind.Interpolated_Or_Raw_Interpolated_String:
                    // Start with the leading $'s
                    if (expectedPrefix is "" or [.., '$'])
                    {
                        if (CurrentCharacter == '$')
                        {
                            // The `$` characters are only part of the prefix; the postfix consists of quotes alone.
                            expectedPrefix += '$';
                            return true;
                        }
                        else if (CurrentCharacter == '"')
                        {
                            expectedPrefix += '"';
                            expectedPostfix += '"';
                            return true;
                        }
                        else
                        {
                            // We expect roslyn to have ended parsing, so we should never get here
                            return Assumed.Unreachable<bool>();
                        }
                    }

                    // Past the `$` run: keep collecting quotes until the first non-quote character.
                    Debug.Assert(expectedPrefix[^1] == '"');
                    if (CurrentCharacter == '"')
                    {
                        expectedPrefix += '"';
                        expectedPostfix += '"';
                        return true;
                    }
                    else
                    {
                        return false;
                    }

                // These kinds have fixed delimiters and never set lookForPrePostFix, so this helper is never called for them.
                case StringOrCharacterKind.Character:
                case StringOrCharacterKind.Verbatim_String:
                case StringOrCharacterKind.Verbatim_Interpolated_At_First_String:
                case StringOrCharacterKind.Verbatim_Interpolated_Dollar_First_String:
                default:
                    return Assumed.Unreachable<bool>();
            }
        }
    }


    /// <summary>
    /// Emits one Razor token for the next piece of Roslyn trivia in <see cref="_currentCSharpTokenTriviaEnumerator"/>,
    /// adding diagnostics for unterminated block comments and for misplaced or disallowed preprocessor directives, and
    /// keeping <see cref="_isOnlyWhitespaceOnLine"/> up to date. When the enumerator is exhausted, no token is produced
    /// and the tokenizer moves to <see cref="RoslynCSharpTokenizerState.Token"/> (after leading trivia) or
    /// <see cref="RoslynCSharpTokenizerState.Start"/> (after trailing trivia).
    /// </summary>
    /// <exception cref="InvalidOperationException">The trivia is of a kind that is not handled here.</exception>
    private StateResult Trivia()
    {
        Debug.Assert(_currentCSharpTokenTriviaEnumerator is not null);
        var (triviaEnumerator, isLeading) = _currentCSharpTokenTriviaEnumerator.Value;

        if (!triviaEnumerator.MoveNext())
        {
            _currentCSharpTokenTriviaEnumerator = null;
            return Transition(isLeading ? RoslynCSharpTokenizerState.Token : RoslynCSharpTokenizerState.Start, null);
        }

        // Need to make sure the class state is correct, since structs are copied
        _currentCSharpTokenTriviaEnumerator = (triviaEnumerator, isLeading);

        var trivia = triviaEnumerator.Current;
        var triviaString = trivia.ToFullString();

        // We handle razor comments with dedicated nodes
        if (trivia.IsKind(CSharpSyntaxKind.MultiLineCommentTrivia) && triviaString.StartsWith("@*", StringComparison.Ordinal))
        {
            // Only the `@` is consumed here; the states that follow handle the `*` and the rest of the comment.
            Debug.Assert(CurrentCharacter == '@');
            TakeCurrent();

            return Transition(
                RoslynCSharpTokenizerState.OnRazorCommentStar,
                EndToken(SyntaxKind.RazorCommentTransition));
        }

        // Use FullSpan here because doc comment trivias exclude the leading `///` or `/**` and the trailing `*/`
        AdvancePastSpan(trivia.FullSpan);

        // A block comment that runs to the end of the file without a closing `*/` is unterminated.
        if (EndOfFile
            && trivia.Kind() is CSharpSyntaxKind.MultiLineCommentTrivia or CSharpSyntaxKind.MultiLineDocumentationCommentTrivia
            && !triviaString.EndsWith("*/", StringComparison.Ordinal))
        {
            CurrentErrors.Add(
                RazorDiagnosticFactory.CreateParsing_BlockCommentNotTerminated(
                    new SourceSpan(CurrentStart, contentLength: 1 /* end of file */)));
        }

        SyntaxKind tokenType;
        switch (trivia.Kind())
        {
            case CSharpSyntaxKind.WhitespaceTrivia:
                tokenType = SyntaxKind.Whitespace;
                break;
            case CSharpSyntaxKind.EndOfLineTrivia:
                tokenType = SyntaxKind.NewLine;
                _isOnlyWhitespaceOnLine = true;
                break;
            case CSharpSyntaxKind.SkippedTokensTrivia:
                // We treat skipped tokens as comments because they're tokens that were skipped over by roslyn,
                // and we want to keep them in the output so that the final C# ends up failing due to their presence.
                // We also don't want them to cause loops over comments and other trivia to break.
                tokenType = SyntaxKind.CSharpComment;

                // SkippedTokenTrivia is used for trailing directives; they can consume the trailing newline, so we need to reset _isOnlyWhitespaceOnLine if the trivia ends with one
                _isOnlyWhitespaceOnLine = triviaString.EndsWith('\n');

                // Look for any misplaced directives in the skipped tokens and error if we find them
                if (triviaString.Contains('#'))
                {
                    CurrentErrors.Add(
                        RazorDiagnosticFactory.CreateParsing_PreprocessorDirectivesMustBeAtTheStartOfLine(
                            // Won't be quite precise, but it's close enough
                            new SourceSpan(CurrentStart, contentLength: trivia.FullSpan.Length)));
                }

                break;
            case CSharpSyntaxKind.SingleLineCommentTrivia or
                 CSharpSyntaxKind.MultiLineCommentTrivia or
                 CSharpSyntaxKind.MultiLineDocumentationCommentTrivia or
                 CSharpSyntaxKind.SingleLineDocumentationCommentTrivia:
                tokenType = SyntaxKind.CSharpComment;
                _isOnlyWhitespaceOnLine = false;
                break;
            case var kind when SyntaxFacts.IsPreprocessorDirective(kind):
                tokenType = SyntaxKind.CSharpDirective;

                // A directive that follows other content on its line is reported as misplaced.
                if (!_isOnlyWhitespaceOnLine)
                {
                    CurrentErrors.Add(
                        RazorDiagnosticFactory.CreateParsing_PreprocessorDirectivesMustBeAtTheStartOfLine(
                            new SourceSpan(CurrentStart, contentLength: trivia.FullSpan.Length)));
                }

                var directiveTrivia = (DirectiveTriviaSyntax)trivia.GetStructure()!;
                Debug.Assert(directiveTrivia != null);

                if (directiveTrivia is DefineDirectiveTriviaSyntax or UndefDirectiveTriviaSyntax)
                {
                    CurrentErrors.Add(
                        RazorDiagnosticFactory.CreateParsing_DefineAndUndefNotAllowed(
                            new SourceSpan(CurrentStart, contentLength: directiveTrivia.FullSpan.Length)));
                }

                // We are at the start of a new line only if the directive's trailing trivia ends with an end-of-line.
                _isOnlyWhitespaceOnLine = directiveTrivia.EndOfDirectiveToken.TrailingTrivia is [.., { RawKind: (int)CSharpSyntaxKind.EndOfLineTrivia }];
                break;
            case CSharpSyntaxKind.DisabledTextTrivia:
                tokenType = SyntaxKind.CSharpDisabledText;
                _isOnlyWhitespaceOnLine = true;

                // We want to scan through the disabled text and see if someone misplaced an #else or #endif by not putting it at the start of a line. We can't truly
                // be certain; for example, it could be in html, intentionally. But this is just a warning, and the user can disable it; since we made a breaking change
                // and there could be directives not at the start of a line, we want to be helpful.

                {
                    for (var i = 0; i < triviaString.Length; i++)
                    {
                        var currentChar = triviaString[i];
                        switch (currentChar)
                        {
                            case '\r':
                            case '\n':
                                _isOnlyWhitespaceOnLine = true;
                                break;

                            case '#' when !_isOnlyWhitespaceOnLine:
                                // If there is only whitespace on the current line, and we're about to see a directive, then it clearly wasn't
                                // #endif or #else
                                var start = CurrentStart.AbsoluteIndex + i;
                                if (startsWith("else"))
                                {
                                    var length = "#else".Length;
                                    var linePosition = Source.SourceText.Lines.GetLinePosition(start);
                                    CurrentErrors.Add(
                                        RazorDiagnosticFactory.CreateParsing_PossibleMisplacedPreprocessorDirective(
                                            new SourceSpan(
                                                filePath: CurrentStart.FilePath,
                                                absoluteIndex: start,
                                                lineIndex: linePosition.Line,
                                                characterIndex: linePosition.Character,
                                                length)));
                                    // Move to the last character of `else`; the loop increment then steps past it.
                                    i += 4;
                                }
                                else if (startsWith("endif"))
                                {
                                    var length = "#endif".Length;
                                    var linePosition = Source.SourceText.Lines.GetLinePosition(start);
                                    CurrentErrors.Add(
                                        RazorDiagnosticFactory.CreateParsing_PossibleMisplacedPreprocessorDirective(
                                            new SourceSpan(
                                                filePath: CurrentStart.FilePath,
                                                absoluteIndex: start,
                                                lineIndex: linePosition.Line,
                                                characterIndex: linePosition.Character,
                                                length)));
                                    // Move to the last character of `endif`; the loop increment then steps past it.
                                    i += 5;
                                }

                                break;

                                // Whether the text directly after the current `#` begins with the given directive name.
                                bool startsWith(string substring)
                                {
                                    Debug.Assert(currentChar == '#');
                                    if (i + 1 + substring.Length > triviaString.Length)
                                    {
                                        return false;
                                    }

                                    return triviaString.AsSpan()[(i + 1)..].StartsWith(substring.AsSpan());
                                }

                            default:
                                if (!SyntaxFacts.IsWhitespace(currentChar))
                                {
                                    _isOnlyWhitespaceOnLine = false;
                                }
                                break;
                        }
                    }
                }
                break;
            case CSharpSyntaxKind.ConflictMarkerTrivia:
                tokenType = SyntaxKind.ConflictMarkerTrivia;
                // conflict markers are only parsed by Roslyn if they start the line, and everything on that line is considered part of the marker
                _isOnlyWhitespaceOnLine = true; 
                break;
            case var kind:
                throw new InvalidOperationException($"Unexpected trivia kind: {kind}.");
        };

        // Stay in the trivia state so the next call handles the next piece of trivia in the list.
        return Stay(EndToken(tokenType));
    }

    /// <summary>
    /// Consumes the <c>*</c> that follows the opening <c>@</c> of a Razor comment and moves on to the comment body.
    /// </summary>
    private StateResult OnRazorCommentStar()
    {
        AssertCurrent('*');
        TakeCurrent();

        var starToken = EndToken(SyntaxKind.RazorCommentStar);
        return Transition(RoslynCSharpTokenizerState.RazorCommentBody, starToken);
    }

    /// <summary>
    /// Lexes a numeric literal (CSharp Spec §2.4.4) with Roslyn and emits it using the text Roslyn already holds for
    /// the token, then queues the token's trailing trivia.
    /// </summary>
    private StateResult NumericLiteral()
    {
        var literal = GetNextResult(NextResultType.Token).Token;
        AdvancePastToken(literal);

        // The content comes from the Roslyn token, so the characters collected in the buffer are not needed.
        Buffer.Clear();
        _currentCSharpTokenTriviaEnumerator = (literal.TrailingTrivia.GetEnumerator(), isLeading: false);

        var numericToken = EndToken(literal.Text, SyntaxKind.NumericLiteral);
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, numericToken);
    }

    /// <summary>
    /// Lexes an identifier-like token with Roslyn and emits it as either an identifier or a keyword, then queues the
    /// token's trailing trivia.
    /// </summary>
    private StateResult Identifier()
    {
        var parsed = GetNextResult(NextResultType.Token);
        var roslynToken = parsed.Token;
        AdvancePastToken(roslynToken);

        // It is a plain identifier only when Roslyn lexed an identifier token AND did not recognize it as a contextual
        // keyword; anything else is surfaced as a keyword.
        var isPlainIdentifier =
            roslynToken.IsKind(CSharpSyntaxKind.IdentifierToken)
            && parsed.ContextualKind is (CSharpSyntaxKind.None or CSharpSyntaxKind.IdentifierToken);
        var razorKind = isPlainIdentifier ? SyntaxKind.Identifier : SyntaxKind.Keyword;

        var razorToken = EndToken(roslynToken.Text, razorKind);

        Buffer.Clear();
        _currentCSharpTokenTriviaEnumerator = (roslynToken.TrailingTrivia.GetEnumerator(), isLeading: false);
        return Transition(RoslynCSharpTokenizerState.TriviaForCSharpToken, razorToken);
    }

    /// <summary>
    /// Consumes the characters of <paramref name="csharpToken"/> itself. Its <c>Span</c> (rather than <c>FullSpan</c>)
    /// is used because trailing trivia is handled differently here than in Roslyn.
    /// </summary>
    private void AdvancePastToken(CSharpSyntaxToken csharpToken)
        => AdvancePastSpan(csharpToken.Span);

    /// <summary>
    /// Consumes characters from the source until the position has advanced by the length of <paramref name="span"/>.
    /// Only the span's length is used; its start is not consulted.
    /// </summary>
    private void AdvancePastSpan(TextSpan span)
    {
        var target = Source.Position + span.Length;

        while (Source.Position < target)
        {
            TakeCurrent();
        }
    }

    /// <summary>
    /// Strongly-typed convenience overload that forwards to the integer-based transition.
    /// </summary>
    private StateResult Transition(RoslynCSharpTokenizerState state, SyntaxToken? result)
        => Transition((int)state, result);

    /// <summary>
    /// Looks up the C# keyword kind for the content of <paramref name="token"/>, trying reserved keywords first and
    /// contextual keywords second.
    /// </summary>
    /// <returns>
    /// The reserved keyword kind if the content is one; otherwise the result of the contextual keyword lookup.
    /// <see cref="CSharpSyntaxKind.None"/> is returned when <paramref name="token"/> is null.
    /// </returns>
    internal override CSharpSyntaxKind? GetTokenKeyword(SyntaxToken token)
    {
        if (token is null)
        {
            return CSharpSyntaxKind.None;
        }

        var text = token.Content;

        var reservedKind = SyntaxFacts.GetKeywordKind(text);
        if (reservedKind != CSharpSyntaxKind.None)
        {
            return reservedKind;
        }

        return SyntaxFacts.GetContextualKeywordKind(text);
    }

    /// <summary>
    /// Asks the Roslyn token parser for the next leading trivia, token, or trailing trivia, and records the result
    /// (together with the current only-whitespace-on-line flag) in <see cref="_resultCache"/> as a reset checkpoint.
    /// </summary>
    /// <param name="expectedType">Which kind of parse to perform.</param>
    private SyntaxTokenParser.Result GetNextResult(NextResultType expectedType)
    {
        SyntaxTokenParser.Result parsed;
        switch (expectedType)
        {
            case NextResultType.LeadingTrivia:
                parsed = _roslynTokenParser.ParseLeadingTrivia();
                break;
            case NextResultType.Token:
                parsed = _roslynTokenParser.ParseNextToken();
                break;
            case NextResultType.TrailingTrivia:
                parsed = _roslynTokenParser.ParseTrailingTrivia();
                break;
            default:
                parsed = Assumed.Unreachable<SyntaxTokenParser.Result>();
                break;
        }

        var parsedToken = parsed.Token;
        var fullStart = parsedToken.FullSpan.Start;

        Debug.Assert(_resultCache.All(r => r.position <= fullStart));

        // The flag is captured as it was BEFORE this result, so that resetting to the checkpoint restores it.
        var checkpoint = (fullStart, parsed, _isOnlyWhitespaceOnLine);

        if (_resultCache.Count > 0 && _resultCache[^1].position == fullStart)
        {
            // The previous checkpoint starts at the same place, which happens when it was an empty trivia parse. The new
            // result fully subsumes it, so replace it rather than keeping both.
            Debug.Assert(_resultCache[^1].result is { Token.FullSpan.Length: 0 });
            Debug.Assert(!parsedToken.HasLeadingTrivia);
            _resultCache[^1] = checkpoint;
        }
        else
        {
            _resultCache.Add(checkpoint);
        }

        // A real token (as opposed to a trivia-only result) means the line now has content on it.
        if (!parsedToken.IsKind(CSharpSyntaxKind.None))
        {
            _isOnlyWhitespaceOnLine = false;
        }

        return parsed;
    }

    /// <summary>
    /// Rewinds the Roslyn token parser to <paramref name="position"/> using the cached checkpoints and puts the
    /// tokenizer back in the <see cref="RoslynCSharpTokenizerState.Start"/> state. Checkpoints that start after
    /// <paramref name="position"/> are discarded along the way.
    /// </summary>
    /// <param name="position">
    /// The position to reset to. It is compared against the cached start positions (which include leading trivia) of
    /// previously parsed results.
    /// </param>
    public override void Reset(int position)
    {
        Debug.Assert(_resultCache.Count > 0);

        // We always walk backwards from the current position, rather than doing a binary search, because the common pattern in the parser is
        // to put tokens back in the order they were returned. This means that the most common reset point is the last token that was parsed.
        // If this ever changes, we can consider doing a binary search at that point.
        for (var i = _resultCache.Count - 1; i >= 0; i--)
        {
            var (currentPosition, currentResult, isOnlyWhitespaceOnLine) = _resultCache[i];
            if (currentPosition == position)
            {
                // We found an exact match, so we can reset to it directly.
                _roslynTokenParser.ResetTo(currentResult);
                _isOnlyWhitespaceOnLine = isOnlyWhitespaceOnLine;
                _resultCache.RemoveAt(i);
                base.CurrentState = (int)RoslynCSharpTokenizerState.Start;
#if DEBUG
                // Cross-check the cached flag against a fresh scan of the source text. The cached value is restored
                // afterwards so that this debug-only verification never changes behavior relative to release builds.
                ResetIsOnlyWhitespaceOnLine();
                Debug.Assert(isOnlyWhitespaceOnLine == _isOnlyWhitespaceOnLine);
                _isOnlyWhitespaceOnLine = isOnlyWhitespaceOnLine;
#endif
                return;
            }
            else if (currentPosition < position)
            {
                // Reset to the one before reset point, then skip forward
                // We don't want to actually remove the result from the cache in this point: the parser could later ask to reset further back in this same result. This mostly happens for trivia, where the
                // parser may ask to put multiple tokens back, each part of the same roslyn trivia piece. However, it is not _only_ for trivia, so we can't assert that. The parser may decide, for example,
                // to split the @ from a token, reset the tokenizer after the @, and then keep going.
                _roslynTokenParser.ResetTo(currentResult);
                _roslynTokenParser.SkipForwardTo(position);
                base.CurrentState = (int)RoslynCSharpTokenizerState.Start;
                // Can't reuse the isOnlyWhitespaceOnLine value, since we're not before the start of the result anymore.
                ResetIsOnlyWhitespaceOnLine();
                return;
            }
            else
            {
                // We know we're not going to be interested in this reset point anymore, so we can remove it.
                Debug.Assert(currentPosition > position);
                _resultCache.RemoveAt(i);
            }
        }
    }

    /// <summary>
    /// Disposes the base tokenizer and the Roslyn token parser, then returns the result cache list to the
    /// <see cref="ListPool{T}"/> it was obtained from.
    /// </summary>
    public override void Dispose()
    {
        base.Dispose();
        _roslynTokenParser.Dispose();
        // The cache list came from the pool's default instance (see the field initializer), so hand it back there.
        ListPool<(int, SyntaxTokenParser.Result, bool)>.Default.Return(_resultCache);
    }

    /// <summary>
    /// Determines whether <paramref name="tokenText"/> finishes with the closing delimiter
    /// <paramref name="expectedPostfix"/>, optionally followed by a UTF-8 literal suffix (<c>u8</c> or <c>U8</c>).
    /// </summary>
    /// <param name="tokenText">The full text of the string token.</param>
    /// <param name="expectedPostfix">The closing delimiter the token must end with, e.g. <c>"</c> or <c>"""</c>.</param>
    /// <returns>
    /// <see langword="true"/> when the text ends with the delimiter, or with the delimiter immediately followed by
    /// <c>u8</c>/<c>U8</c>; otherwise <see langword="false"/>.
    /// </returns>
    private static bool IsStringProperlyTerminated(string tokenText, string expectedPostfix)
    {
        // Length of the "u8" / "U8" suffix that marks a UTF-8 string literal.
        const int Utf8SuffixLength = 2;

        // Simple case: the token ends right at its closing delimiter.
        if (tokenText.EndsWith(expectedPostfix, StringComparison.Ordinal))
        {
            return true;
        }

        // Otherwise the only acceptable shape is <delimiter> + "u8". The token must be long enough to hold both
        // the closing delimiter and the two suffix characters.
        var lengthWithoutSuffix = tokenText.Length - Utf8SuffixLength;
        if (lengthWithoutSuffix < expectedPostfix.Length)
        {
            return false;
        }

        // The last two characters must be 'u' or 'U' followed by '8'.
        var endsWithUtf8Suffix =
            (tokenText[lengthWithoutSuffix] is 'u' or 'U') &&
            tokenText[lengthWithoutSuffix + 1] == '8';

        if (!endsWithUtf8Suffix)
        {
            return false;
        }

        // With the suffix stripped, what remains has to end with the closing delimiter.
        return tokenText
            .AsSpan(0, lengthWithoutSuffix)
            .EndsWith(expectedPostfix.AsSpan(), StringComparison.Ordinal);
    }

    /// <summary>
    /// Identifies a part of a parsed result: its leading trivia, the token itself, or its trailing trivia.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the consumers of this enum are outside this part of the file; presumably it selects which
    /// part of a Roslyn token result is handled next. Confirm against the call sites.
    /// </remarks>
    private enum NextResultType
    {
        LeadingTrivia,
        Token,
        TrailingTrivia,
    }

    /// <summary>
    /// Distinguishes the kinds of character and string literals handled by the tokenizer.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the code that produces and consumes these values is outside this part of the file. The
    /// per-member notes below are inferred from the member names only and should be confirmed against the call sites.
    /// </remarks>
    private enum StringOrCharacterKind
    {
        // Presumably a character literal ('x').
        Character,
        // Presumably a regular ("...") or raw ("""...""") string literal.
        String_Or_Raw_String,
        // Presumably an interpolated ($"...") or raw interpolated ($"""...""") string literal.
        Interpolated_Or_Raw_Interpolated_String,
        // Presumably a verbatim (@"...") string literal.
        Verbatim_String,
        // Presumably a verbatim interpolated string written with '@' first (@$"...").
        Verbatim_Interpolated_At_First_String,
        // Presumably a verbatim interpolated string written with '$' first ($@"...").
        Verbatim_Interpolated_Dollar_First_String,
        // Presumably a verbatim interpolated string where the prefix order is not distinguished - TODO confirm.
        Verbatim_Interpolated_String,
    }

    /// <summary>
    /// The states of this tokenizer. Values are stored in the base class's <c>CurrentState</c> as <see cref="int"/>s.
    /// </summary>
    private enum RoslynCSharpTokenizerState
    {
        // The state the tokenizer is put into after a Reset.
        Start,
        Token,
        // Active while _currentCSharpTokenTriviaEnumerator is non-null, i.e. while walking the trivia of a C# token.
        TriviaForCSharpToken,

        // Razor Comments - need to be the same for HTML and CSharp
        // The four members below take their values directly from RazorCommentTokenizerState to guarantee that.
        // NOTE(review): OnRazorCommentStar is implicitly numbered (previous member + 1); presumably this cannot
        // collide with the RazorCommentTokenizerState values - confirm against that enum.
        OnRazorCommentStar,
        AfterRazorCommentTransition = RazorCommentTokenizerState.AfterRazorCommentTransition,
        RazorCommentBody = RazorCommentTokenizerState.RazorCommentBody,
        StarAfterRazorCommentBody = RazorCommentTokenizerState.StarAfterRazorCommentBody,
        AtTokenAfterRazorCommentBody = RazorCommentTokenizerState.AtTokenAfterRazorCommentBody,
    }
}
File: Language\Legacy\RoslynCSharpTokenizer.cs	Web Access
Project: src\roslyn\src\Razor\src\Compiler\Microsoft.CodeAnalysis.Razor.Compiler\src\Microsoft.CodeAnalysis.Razor.Compiler.csproj (Microsoft.CodeAnalysis.Razor.Compiler)