// Lexer_RawStringLiteral.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Text;
using Microsoft.CodeAnalysis.PooledObjects;
using Roslyn.Utilities;
 
namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax
{
    internal partial class Lexer
    {
        /// <summary>
        /// Advances the text window past every consecutive occurrence of <paramref name="ch"/> found at the
        /// current position.
        /// </summary>
        /// <param name="ch">The character to consume.</param>
        /// <returns>
        /// The distance the text window position moved while consuming <paramref name="ch"/> (zero when the next
        /// character is not <paramref name="ch"/>).
        /// </returns>
        private int ConsumeCharSequence(char ch)
        {
            var initialPosition = TextWindow.Position;

            for (var next = TextWindow.PeekChar(); next == ch; next = TextWindow.PeekChar())
            {
                TextWindow.AdvanceChar();
            }

            var consumed = TextWindow.Position - initialPosition;
            return consumed;
        }
 
        /// <summary>
        /// Consumes the run of consecutive <c>"</c> characters at the current position.
        /// </summary>
        /// <returns>The result of <see cref="ConsumeCharSequence"/> for the <c>"</c> character.</returns>
        private int ConsumeQuoteSequence()
        {
            return ConsumeCharSequence('"');
        }
 
        /// <summary>
        /// Consumes the run of consecutive <c>$</c> characters at the current position.
        /// </summary>
        /// <returns>The result of <see cref="ConsumeCharSequence"/> for the <c>$</c> character.</returns>
        private int ConsumeDollarSignSequence()
        {
            return ConsumeCharSequence('$');
        }
 
        /// <summary>
        /// Consumes the run of consecutive <c>@</c> characters at the current position.
        /// </summary>
        /// <returns>The result of <see cref="ConsumeCharSequence"/> for the <c>@</c> character.</returns>
        private int ConsumeAtSignSequence()
        {
            return ConsumeCharSequence('@');
        }
 
        /// <summary>
        /// Consumes the run of consecutive <c>{</c> characters at the current position.
        /// </summary>
        /// <returns>The result of <see cref="ConsumeCharSequence"/> for the <c>{</c> character.</returns>
        private int ConsumeOpenBraceSequence()
        {
            return ConsumeCharSequence('{');
        }
 
        /// <summary>
        /// Consumes the run of consecutive <c>}</c> characters at the current position.
        /// </summary>
        /// <returns>The result of <see cref="ConsumeCharSequence"/> for the <c>}</c> character.</returns>
        private int ConsumeCloseBraceSequence()
        {
            return ConsumeCharSequence('}');
        }
 
        /// <summary>
        /// Advances past every character at the current position for which
        /// <c>SyntaxFacts.IsWhitespace</c> returns true, stopping at the first character that is not whitespace.
        /// </summary>
        /// <param name="builder">
        /// Optional sink that receives each whitespace character consumed, in order.  Pass <see langword="null"/>
        /// to skip the whitespace without recording it.
        /// </param>
        private void ConsumeWhitespace(StringBuilder? builder)
        {
            for (var next = TextWindow.PeekChar(); SyntaxFacts.IsWhitespace(next); next = TextWindow.PeekChar())
            {
                // Record the character (when a sink was supplied) before stepping over it.
                builder?.Append(next);
                TextWindow.AdvanceChar();
            }
        }
 
        /// <summary>
        /// Determines whether <paramref name="currentChar"/> represents the end of the text: it must be
        /// <c>SlidingTextWindow.InvalidCharacter</c> and the text window must also report
        /// <c>IsReallyAtEnd()</c>.
        /// </summary>
        /// <param name="currentChar">The character most recently peeked from the text window.</param>
        private bool IsAtEndOfText(char currentChar)
        {
            // Any other character cannot be the end; the window is only consulted for the sentinel value.
            if (currentChar != SlidingTextWindow.InvalidCharacter)
                return false;

            return TextWindow.IsReallyAtEnd();
        }
 
        /// <summary>
        /// Scans a raw string literal, starting at its opening quote sequence, and fills in the kind, string value
        /// and text of <paramref name="info"/>.
        /// </summary>
        /// <param name="info">The token information to populate.</param>
        /// <param name="inDirective">
        /// When true, <c>ScanUtf8Suffix</c> is not called, so the token kind is never switched to one of the UTF-8
        /// raw string kinds.
        /// </param>
        private void ScanRawStringLiteral(ref TokenInfo info, bool inDirective)
        {
            _builder.Length = 0;

            var openingQuoteCount = ConsumeQuoteSequence();
            Debug.Assert(openingQuoteCount >= 3);

            // Whitespace may trail the opening delimiter; step over it without recording it.
            ConsumeWhitespace(builder: null);

            // A newline right after the opening delimiter (and any trailing whitespace) selects the multi-line
            // form.  Anything else means the literal is of the single-line form.
            var startsWithNewLine = SyntaxFacts.IsNewLine(TextWindow.PeekChar());
            if (startsWithNewLine)
            {
                ScanMultiLineRawStringLiteral(ref info, openingQuoteCount);
            }
            else
            {
                ScanSingleLineRawStringLiteral(ref info, openingQuoteCount);
            }

            if (!this.HasErrors)
            {
                // With no errors reported, the scan routine above must have produced the literal's value.
                Debug.Assert(info.StringValue != null);
            }
            else
            {
                // Once an error has been reported the true value of the literal cannot be determined.  Use the
                // text from just after the opening delimiter up to the current position instead.  Raw strings
                // have no escapes, so the text can be taken as-is; indentation is not stripped, and a multi-line
                // literal keeps the contents of its first line.  Since this only happens for erroneous code the
                // exact value is ours to define; consumers should check for diagnostics before trusting it.
                var contentStart = this.LexemeStartPosition + openingQuoteCount;
                var contentLength = TextWindow.Position - contentStart;

                info.StringValue = TextWindow.GetText(
                    position: contentStart,
                    length: contentLength,
                    intern: true);
            }

            Debug.Assert(info.Kind is (SyntaxKind.SingleLineRawStringLiteralToken or SyntaxKind.MultiLineRawStringLiteralToken));

            if (!inDirective && ScanUtf8Suffix())
            {
                // Map the plain raw string kind onto its UTF-8 counterpart.
                info.Kind = info.Kind switch
                {
                    SyntaxKind.SingleLineRawStringLiteralToken => SyntaxKind.Utf8SingleLineRawStringLiteralToken,
                    SyntaxKind.MultiLineRawStringLiteralToken => SyntaxKind.Utf8MultiLineRawStringLiteralToken,
                    _ => throw ExceptionUtilities.UnexpectedValue(info.Kind),
                };
            }

            info.Text = this.GetInternedLexemeText();
        }
 
        /// <summary>
        /// Scans the remainder of a single-line raw string literal whose opening delimiter (and any whitespace
        /// following it) has already been consumed.
        /// </summary>
        /// <param name="info">
        /// Receives the single-line raw string kind and, once a closing delimiter is found, the string value.
        /// </param>
        /// <param name="startingQuoteCount">The number of quotes in the opening delimiter.</param>
        /// <remarks>
        /// Reports <c>ERR_UnterminatedRawString</c> if a newline or the end of the text is reached first, and
        /// <c>ERR_TooManyQuotesForRawString</c> if the closing quote run is longer than the opening delimiter.
        /// </remarks>
        private void ScanSingleLineRawStringLiteral(ref TokenInfo info, int startingQuoteCount)
        {
            info.Kind = SyntaxKind.SingleLineRawStringLiteralToken;

            while (true)
            {
                var next = TextWindow.PeekChar();

                // Reaching the end of the line before a closing delimiter leaves the literal unterminated.
                if (SyntaxFacts.IsNewLine(next))
                {
                    this.AddError(TextWindow.Position, width: TextWindow.GetNewLineWidth(), ErrorCode.ERR_UnterminatedRawString);
                    return;
                }

                // The same goes for running out of text.
                if (IsAtEndOfText(next))
                {
                    this.AddError(TextWindow.Position, width: 0, ErrorCode.ERR_UnterminatedRawString);
                    return;
                }

                if (next == '"')
                {
                    var quoteRunStart = TextWindow.Position;
                    var quoteRunLength = ConsumeQuoteSequence();

                    // A run shorter than the opening delimiter is ordinary content; keep scanning.
                    if (quoteRunLength >= startingQuoteCount)
                    {
                        // A raw string can never be directly followed by another string, so any quotes beyond
                        // the ones needed to close the literal are reported as an error.
                        var surplusQuotes = quoteRunLength - startingQuoteCount;
                        if (surplusQuotes > 0)
                        {
                            this.AddError(
                                position: TextWindow.Position - surplusQuotes,
                                width: surplusQuotes,
                                ErrorCode.ERR_TooManyQuotesForRawString);
                        }

                        // The value is everything between the opening delimiter and the start of the closing
                        // quote run.
                        var contentStart = this.LexemeStartPosition + startingQuoteCount;
                        var contentLength = quoteRunStart - contentStart;

                        info.StringValue = TextWindow.GetText(
                            position: contentStart,
                            length: contentLength,
                            intern: true);
                        return;
                    }
                }
                else
                {
                    // Any non-quote character is content; step over it.
                    TextWindow.AdvanceChar();
                }
            }
        }
 
        /// <summary>
        /// Scans a multi-line raw string literal in two passes.  The first pass walks the lines to find the end of
        /// the literal and capture the leading whitespace of the last line scanned.  The second pass rewinds to
        /// the start and builds the string value in <c>_builder</c>.
        /// </summary>
        /// <param name="info">
        /// Receives the multi-line raw string kind.  The string value is only assigned when the first pass
        /// reports no errors and finds at least one content line.
        /// </param>
        /// <param name="startingQuoteCount">The number of quotes in the opening delimiter.</param>
        private void ScanMultiLineRawStringLiteral(ref TokenInfo info, int startingQuoteCount)
        {
            info.Kind = SyntaxKind.MultiLineRawStringLiteralToken;

            // The indentation-whitespace computed from the very last line of the raw string literal
            var indentationWhitespace = PooledStringBuilder.GetInstance();

            // The leading whitespace of whatever line we are currently on.
            var currentLineWhitespace = PooledStringBuilder.GetInstance();
            try
            {
                // Do the first pass, finding the end of the raw string, and determining the 'indentation whitespace'
                // that must be complementary with all content lines of the raw string literal.
                var afterStartDelimiter = TextWindow.Position;
                Debug.Assert(SyntaxFacts.IsNewLine(TextWindow.PeekChar()));

                // Each successful call consumes one content line; the call that returns false has either reached
                // the terminating line or reported an error.
                var contentLineCount = 0;
                while (ScanMultiLineRawStringLiteralLine(startingQuoteCount, indentationWhitespace.Builder))
                    contentLineCount++;

                // If the initial scan failed then just bail out without a constant value.
                if (this.HasErrors)
                    return;

                // The trivial raw string literal is not legal in the language.
                if (contentLineCount == 0)
                {
                    this.AddError(
                        position: TextWindow.Position - startingQuoteCount,
                        width: startingQuoteCount,
                        ErrorCode.ERR_RawStringMustContainContent);
                    return;
                }

                // Now, do the second pass, building up the literal value.  This may produce an error as well if the
                // indentation whitespace of the lines isn't complementary.

                // Reset us to right after the starting delimiter.  Note: if we fail to generate a constant value we'll
                // ensure that we reset back to the original end we scanned to above.
                var tokenEnd = TextWindow.Position;
                TextWindow.Reset(afterStartDelimiter);
                Debug.Assert(SyntaxFacts.IsNewLine(TextWindow.PeekChar()));

                for (var currentLine = 0; currentLine < contentLineCount; currentLine++)
                {
                    AddMultiLineRawStringLiteralLineContents(
                        indentationWhitespace.Builder,
                        currentLineWhitespace.Builder,
                        firstContentLine: currentLine == 0);

                    // If processing the line produced errors, then bail out from continued processing.
                    if (this.HasErrors)
                        break;
                }

                // An error in the second pass yields an empty value here.
                info.StringValue = this.HasErrors ? "" : TextWindow.Intern(_builder);

                // Make sure that even if we fail to determine the constant content value of the string that
                // we still consume all the way to original end that we computed.
                TextWindow.Reset(tokenEnd);
            }
            finally
            {
                // Return both pooled builders regardless of which path exited the try block.
                indentationWhitespace.Free();
                currentLineWhitespace.Free();
            }
        }
 
        /// <summary>
        /// Scans one line of a multi-line raw string literal.  The text window is expected to be positioned at the
        /// newline that precedes the line.
        /// </summary>
        /// <param name="startingQuoteCount">The number of quotes in the opening delimiter.</param>
        /// <param name="indentationWhitespace">
        /// Cleared and then filled with the leading whitespace of the line being scanned.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if a content line was consumed up to (but not including) its terminating
        /// newline; <see langword="false"/> if the line started with the closing delimiter or an error was
        /// reported.
        /// </returns>
        private bool ScanMultiLineRawStringLiteralLine(
            int startingQuoteCount, StringBuilder indentationWhitespace)
        {
            TextWindow.AdvancePastNewLine();

            indentationWhitespace.Clear();
            ConsumeWhitespace(indentationWhitespace);

            // A long-enough quote run directly after the leading whitespace terminates the literal.
            var leadingQuoteCount = ConsumeQuoteSequence();
            if (leadingQuoteCount >= startingQuoteCount)
            {
                // A raw string can never be directly followed by another string, so any quotes beyond the ones
                // needed to close the literal are reported as an error.
                var surplusQuotes = leadingQuoteCount - startingQuoteCount;
                if (surplusQuotes > 0)
                {
                    this.AddError(
                        position: TextWindow.Position - surplusQuotes,
                        width: surplusQuotes,
                        ErrorCode.ERR_TooManyQuotesForRawString);
                }

                // No more lines to scan.
                return false;
            }

            // This is a content line: consume it up to the end of the line (or the end of the text, which is an
            // error).
            while (true)
            {
                var next = TextWindow.PeekChar();

                if (IsAtEndOfText(next))
                {
                    this.AddError(TextWindow.Position, width: 0, ErrorCode.ERR_UnterminatedRawString);
                    return false;
                }

                if (SyntaxFacts.IsNewLine(next))
                    return true;

                if (next != '"')
                {
                    TextWindow.AdvanceChar();
                    continue;
                }

                // Quote runs shorter than the delimiter are ordinary content.
                var innerQuoteCount = ConsumeQuoteSequence();
                if (innerQuoteCount < startingQuoteCount)
                    continue;

                // A content line may not contain a quote run that looks like a delimiter (or is longer).
                this.AddError(
                    position: TextWindow.Position - innerQuoteCount,
                    width: innerQuoteCount,
                    ErrorCode.ERR_RawStringDelimiterOnOwnLine);
                return false;
            }
        }
 
        /// <summary>
        /// Appends the contents of one content line of a multi-line raw string literal to <c>_builder</c>, with
        /// the literal's indentation whitespace removed.  The text window must be positioned at the newline that
        /// precedes the line; on return it is positioned at the newline that ends the line (or just after the
        /// line's leading whitespace if an error was reported).
        /// </summary>
        /// <param name="indentationWhitespace">
        /// The indentation whitespace every content line is required to start with.
        /// </param>
        /// <param name="currentLineWhitespace">
        /// Scratch builder; cleared and then filled with the leading whitespace of the current line.
        /// </param>
        /// <param name="firstContentLine">
        /// When true, the newline preceding the line is consumed but not added to the value.
        /// </param>
        /// <remarks>
        /// Reports <c>ERR_LineContainsDifferentWhitespace</c> or <c>ERR_LineDoesNotStartWithSameWhitespace</c>
        /// when the line's leading whitespace is not compatible with <paramref name="indentationWhitespace"/>.
        /// </remarks>
        private void AddMultiLineRawStringLiteralLineContents(
            StringBuilder indentationWhitespace,
            StringBuilder currentLineWhitespace,
            bool firstContentLine)
        {
            Debug.Assert(SyntaxFacts.IsNewLine(TextWindow.PeekChar()));

            var newLineWidth = TextWindow.GetNewLineWidth();
            for (var i = 0; i < newLineWidth; i++)
            {
                // the initial newline in `"""   \r\n` is not added to the contents.
                if (!firstContentLine)
                    _builder.Append(TextWindow.PeekChar());

                TextWindow.AdvanceChar();
            }

            var lineStartPosition = TextWindow.Position;
            currentLineWhitespace.Clear();
            ConsumeWhitespace(currentLineWhitespace);

            if (!StartsWith(currentLineWhitespace, indentationWhitespace))
            {
                // The indentation whitespace is not a prefix of this line's leading whitespace.
                //
                // If we're not on a blank line then this is bad.  That's a content line that doesn't start
                // with the indentation whitespace.  If we are on a blank line then it's ok as long as the
                // whitespace we do have is itself a prefix of the indentation whitespace.
                var isBlankLine = SyntaxFacts.IsNewLine(TextWindow.PeekChar());
                var isLegalBlankLine = isBlankLine && StartsWith(indentationWhitespace, currentLineWhitespace);
                if (!isLegalBlankLine)
                {
                    // Specialized error message if this is a spacing difference.
                    if (CheckForSpaceDifference(
                            currentLineWhitespace, indentationWhitespace,
                            out var currentLineWhitespaceChar, out var indentationWhitespaceChar))
                    {
                        this.AddError(
                            lineStartPosition,
                            width: TextWindow.Position - lineStartPosition,
                            ErrorCode.ERR_LineContainsDifferentWhitespace,
                            currentLineWhitespaceChar, indentationWhitespaceChar);
                    }
                    else
                    {
                        this.AddError(
                            lineStartPosition,
                            width: TextWindow.Position - lineStartPosition,
                            ErrorCode.ERR_LineDoesNotStartWithSameWhitespace);
                    }
                    return;
                }
            }

            // Skip the leading whitespace that matches the terminator line and add any whitespace past that to the
            // string value.  Note: if the current line is shorter than the indentation whitespace (a legal blank
            // line), nothing is copied.
#if NET
            // Only call the range overload when there is something to copy.  On a legal blank line the start index
            // would lie past the end of currentLineWhitespace, which the documented contract of
            // StringBuilder.Append(StringBuilder, int, int) lists as an out-of-range argument.
            var extraWhitespaceLength = currentLineWhitespace.Length - indentationWhitespace.Length;
            if (extraWhitespaceLength > 0)
                _builder.Append(currentLineWhitespace, startIndex: indentationWhitespace.Length, count: extraWhitespaceLength);
#else
            for (var i = indentationWhitespace.Length; i < currentLineWhitespace.Length; i++)
                _builder.Append(currentLineWhitespace[i]);
#endif

            // Consume up to the next new line, adding every character to the string value.
            while (true)
            {
                var currentChar = TextWindow.PeekChar();

                if (SyntaxFacts.IsNewLine(currentChar))
                    return;

                _builder.Append(currentChar);
                TextWindow.AdvanceChar();
            }
        }
 
        /// <summary>
        /// Looks for the first index, within the length shared by both builders, at which the two builders hold
        /// different characters that are both whitespace according to <c>SyntaxFacts.IsWhitespace</c>.
        /// </summary>
        /// <param name="currentLineWhitespace">Leading whitespace of the line being checked.</param>
        /// <param name="indentationLineWhitespace">Indentation whitespace to compare against.</param>
        /// <param name="currentLineMessage">
        /// On success, the <see cref="CharToString"/> rendering of the differing character from
        /// <paramref name="currentLineWhitespace"/>; otherwise <see langword="null"/>.
        /// </param>
        /// <param name="indentationLineMessage">
        /// On success, the <see cref="CharToString"/> rendering of the differing character from
        /// <paramref name="indentationLineWhitespace"/>; otherwise <see langword="null"/>.
        /// </param>
        /// <returns><see langword="true"/> if such a pair of characters was found.</returns>
        private static bool CheckForSpaceDifference(
            StringBuilder currentLineWhitespace,
            StringBuilder indentationLineWhitespace,
            [NotNullWhen(true)] out string? currentLineMessage,
            [NotNullWhen(true)] out string? indentationLineMessage)
        {
            currentLineMessage = null;
            indentationLineMessage = null;

            // Only positions present in both builders can be compared.
            var sharedLength = Math.Min(currentLineWhitespace.Length, indentationLineWhitespace.Length);

            var index = 0;
            while (index < sharedLength)
            {
                var fromCurrentLine = currentLineWhitespace[index];
                var fromIndentation = indentationLineWhitespace[index];
                index++;

                if (fromCurrentLine == fromIndentation)
                    continue;

                if (!SyntaxFacts.IsWhitespace(fromCurrentLine) || !SyntaxFacts.IsWhitespace(fromIndentation))
                    continue;

                currentLineMessage = CharToString(fromCurrentLine);
                indentationLineMessage = CharToString(fromIndentation);
                return true;
            }

            return false;
        }
 
        /// <summary>
        /// Produces a printable, escaped representation of <paramref name="ch"/>.
        /// </summary>
        /// <param name="ch">The character to render.</param>
        /// <returns>
        /// <c>\t</c>, <c>\v</c> or <c>\f</c> for tab, vertical tab and form feed respectively; for every other
        /// character, <c>\u</c> followed by the character's code as four lowercase hexadecimal digits.
        /// </returns>
        public static string CharToString(char ch)
        {
            switch (ch)
            {
                case '\t':
                    return @"\t";

                case '\v':
                    return @"\v";

                case '\f':
                    return @"\f";

                default:
                    return @$"\u{(int)ch:x4}";
            }
        }
 
        /// <summary>
        /// Determines whether the characters of <paramref name="value"/> form a prefix of the characters of
        /// <paramref name="sb"/>.
        /// </summary>
        /// <param name="sb">The builder whose leading characters are examined.</param>
        /// <param name="value">The candidate prefix.  An empty builder is a prefix of every builder.</param>
        /// <returns>
        /// <see langword="true"/> if <paramref name="sb"/> is at least as long as <paramref name="value"/> and
        /// matches it character for character over that length; otherwise <see langword="false"/>.
        /// </returns>
        private static bool StartsWith(StringBuilder sb, StringBuilder value)
        {
            var prefixLength = value.Length;

            // A prefix can never be longer than the text it is supposed to start.
            if (prefixLength > sb.Length)
                return false;

            // Walk forward while the characters agree; a full walk means every character matched.
            var index = 0;
            while (index < prefixLength && sb[index] == value[index])
                index++;

            return index == prefixLength;
        }
    }
}
// File: Parser\Lexer_RawStringLiteral.cs
// Project: src\src\Compilers\CSharp\Portable\Microsoft.CodeAnalysis.CSharp.csproj (Microsoft.CodeAnalysis.CSharp)