// CSharpVirtualCharService.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis.Collections;
using Microsoft.CodeAnalysis.CSharp.LanguageService;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.LanguageService;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Shared.Extensions;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;
 
namespace Microsoft.CodeAnalysis.CSharp.EmbeddedLanguages.VirtualChars;
 
internal class CSharpVirtualCharService : AbstractVirtualCharService
{
    /// <summary>
    /// Shared, statically-initialized instance of this service, exposed through the
    /// <see cref="IVirtualCharService"/> interface.
    /// </summary>
    public static readonly IVirtualCharService Instance = new CSharpVirtualCharService();
 
    /// <summary>
    /// Pool of <see cref="VirtualCharGreen"/> list builders.  Used by <c>TryConvertStringToVirtualChars</c> when a
    /// token contains escapes so that a fresh builder does not have to be allocated for every conversion.
    /// </summary>
    private static readonly ObjectPool<ImmutableSegmentedList<VirtualCharGreen>.Builder> s_pooledBuilders = new(() => ImmutableSegmentedList.CreateBuilder<VirtualCharGreen>());
 
    /// <summary>
    /// Protected constructor: only this type (see <see cref="Instance"/>) and derived types can create instances.
    /// </summary>
    protected CSharpVirtualCharService()
    {
    }
 
    /// <summary>
    /// The C# syntax facts (<see cref="CSharpSyntaxFacts.Instance"/>) supplied to the base service.
    /// </summary>
    protected override ISyntaxFacts SyntaxFacts => CSharpSyntaxFacts.Instance;
 
    /// <summary>
    /// Returns <see langword="true"/> when <paramref name="token"/> is part of a multi-line raw string.  That is
    /// the case when the token itself is a (possibly UTF-8) multi-line raw string literal token, or when its
    /// grandparent is an interpolated string expression whose start token is a multi-line raw start token.
    /// </summary>
    protected override bool IsMultiLineRawStringToken(SyntaxToken token)
    {
        var kind = token.Kind();
        var isRawLiteralToken =
            kind == SyntaxKind.MultiLineRawStringLiteralToken ||
            kind == SyntaxKind.Utf8MultiLineRawStringLiteralToken;

        return isRawLiteralToken ||
            (token.Parent?.Parent is InterpolatedStringExpressionSyntax
            {
                StringStartToken.RawKind: (int)SyntaxKind.InterpolatedMultiLineRawStringStartToken
            });
    }
 
    /// <summary>
    /// Dispatches on the kind of <paramref name="token"/> to the converter that understands that kind of string
    /// or character token.  Returns <see langword="default"/> for tokens inside preprocessor directives and for
    /// token kinds that are not handled here.
    /// </summary>
    protected override VirtualCharGreenSequence TryConvertToVirtualCharsWorker(SyntaxToken token)
    {
        // String literals can appear inside C# preprocessor directives (for example the path in a #line
        // directive), but they are not ordinary literals: the language performs no escaping in them, so a \ is
        // just a \.  They are not verbatim strings either; they cannot contain a double quote or a newline.
        //
        // Converting them would be trivial (a 1:1 mapping from literal contents to chars), but we do not bother
        // returning anything.  There are no escapes to classify, and such strings are never Regex/Json
        // snippets, so no feature would benefit.  Bailing out is simplest.
        if (IsInDirective(token.Parent))
            return default;

        Debug.Assert(!token.ContainsDiagnostics);

        return token.Kind() switch
        {
            SyntaxKind.CharacterLiteralToken
                => TryConvertStringToVirtualChars(token, "'", "'", escapeBraces: false),

            SyntaxKind.StringLiteralToken when token.IsVerbatimStringLiteral()
                => TryConvertVerbatimStringToVirtualChars(token, "@\"", "\"", escapeBraces: false),
            SyntaxKind.StringLiteralToken
                => TryConvertStringToVirtualChars(token, "\"", "\"", escapeBraces: false),

            SyntaxKind.Utf8StringLiteralToken when token.IsVerbatimStringLiteral()
                => TryConvertVerbatimStringToVirtualChars(token, "@\"", "\"u8", escapeBraces: false),
            SyntaxKind.Utf8StringLiteralToken
                => TryConvertStringToVirtualChars(token, "\"", "\"u8", escapeBraces: false),

            SyntaxKind.SingleLineRawStringLiteralToken or SyntaxKind.Utf8SingleLineRawStringLiteralToken
                => TryConvertSingleLineRawStringToVirtualChars(token),

            SyntaxKind.MultiLineRawStringLiteralToken or SyntaxKind.Utf8MultiLineRawStringLiteralToken
                => token.GetRequiredParent() is LiteralExpressionSyntax literalExpression
                    ? TryConvertMultiLineRawStringToVirtualChars(token, literalExpression, tokenIncludeDelimiters: true)
                    : default,

            SyntaxKind.InterpolatedStringTextToken
                => ConvertInterpolatedStringText(token),

            _ => default,
        };

        // Handles a text token that lives inside an interpolated string, either directly as text content or as
        // the text of an interpolation's format clause.
        static VirtualCharGreenSequence ConvertInterpolatedStringText(SyntaxToken textToken)
        {
            var container = textToken.GetRequiredParent();
            var inFormatClause = container is InterpolationFormatClauseSyntax;

            // A format clause sits one level deeper, so step up once more before reaching the node whose parent
            // is the interpolated string expression.
            var childOfInterpolatedString = inFormatClause ? container.GetRequiredParent() : container;
            var interpolatedString = (InterpolatedStringExpressionSyntax)childOfInterpolatedString.GetRequiredParent();

            switch (interpolatedString.StringStartToken.Kind())
            {
                case SyntaxKind.InterpolatedStringStartToken:
                    return TryConvertStringToVirtualChars(textToken, "", "", escapeBraces: true);

                case SyntaxKind.InterpolatedVerbatimStringStartToken:
                    return TryConvertVerbatimStringToVirtualChars(textToken, "", "", escapeBraces: true);

                case SyntaxKind.InterpolatedSingleLineRawStringStartToken:
                    return TryConvertSingleLineRawStringToVirtualChars(textToken);

                case SyntaxKind.InterpolatedMultiLineRawStringStartToken:
                    // Format clauses must be single line, even when in a multi-line interpolation.
                    if (inFormatClause)
                        return TryConvertSingleLineRawStringToVirtualChars(textToken);

                    return TryConvertMultiLineRawStringToVirtualChars(textToken, interpolatedString, tokenIncludeDelimiters: false);

                default:
                    return default;
            }
        }
    }
 
    /// <summary>
    /// Returns <see langword="true"/> if <paramref name="node"/> or any of its ancestors is a
    /// <see cref="DirectiveTriviaSyntax"/>.  Ancestors are walked with <c>ascendOutOfTrivia: true</c>.
    /// </summary>
    private static bool IsInDirective(SyntaxNode? node)
    {
        for (var current = node; current != null; current = current.GetParent(ascendOutOfTrivia: true))
        {
            if (current is DirectiveTriviaSyntax)
                return true;
        }

        return false;
    }
 
    /// <summary>
    /// Converts a verbatim string token by forwarding all arguments unchanged to
    /// <c>TryConvertSimpleDoubleQuoteString</c>.
    /// </summary>
    private static VirtualCharGreenSequence TryConvertVerbatimStringToVirtualChars(SyntaxToken token, string startDelimiter, string endDelimiter, bool escapeBraces)
    {
        return TryConvertSimpleDoubleQuoteString(token, startDelimiter, endDelimiter, escapeBraces);
    }
 
    /// <summary>
    /// Converts the text of a single-line raw string token to virtual chars.  For the literal token kinds the
    /// surrounding quote delimiters (and the two-character UTF-8 suffix, if any) are excluded; for any other
    /// token kind the entire token text is converted.
    /// </summary>
    private static VirtualCharGreenSequence TryConvertSingleLineRawStringToVirtualChars(SyntaxToken token)
    {
        var text = token.Text;
        var kind = token.Kind();

        var contentStart = 0;
        var contentEnd = text.Length;

        // Drop the trailing UTF-8 suffix before looking at the closing quotes.
        if (kind == SyntaxKind.Utf8SingleLineRawStringLiteralToken)
        {
            Contract.ThrowIfFalse(text is [.., 'u' or 'U', '8']);
            contentEnd -= "u8".Length;
        }

        if (kind == SyntaxKind.SingleLineRawStringLiteralToken || kind == SyntaxKind.Utf8SingleLineRawStringLiteralToken)
        {
            Contract.ThrowIfFalse(text[0] == '"');

            // Peel one quote from each end for every opening quote found.
            for (; text[contentStart] == '"'; contentStart++, contentEnd--)
            {
                // All quotes should be paired at the end
                Contract.ThrowIfFalse(text[contentEnd - 1] == '"');
            }
        }

        var chars = ImmutableSegmentedList.CreateBuilder<VirtualCharGreen>();

        var position = contentStart;
        while (position < contentEnd)
            position += ConvertTextAtIndexToVirtualChar(text, position, chars);

        return CreateVirtualCharSequence(text, contentStart, contentEnd, chars);
    }
 
    /// <summary>
    /// Creates the sequence for the <b>content</b> characters in this <paramref name="token"/>.  This will not
    /// include indentation whitespace that the language specifies is not part of the content.
    /// </summary>
    /// <param name="token">The multi-line raw string token (or interpolated text token) whose text is
    /// converted.</param>
    /// <param name="parentExpression">The containing expression for this token.  This is needed so that we can
    /// determine the indentation whitespace based on the last line of the containing multiline literal.</param>
    /// <param name="tokenIncludeDelimiters">If this token includes the quote (<c>"</c>) characters for the
    /// delimiters inside of it or not.  If so, then those quotes will need to be skipped when determining the
    /// content</param>
    /// <returns><see langword="default"/> if <paramref name="parentExpression"/> has any error-severity
    /// diagnostic; otherwise a sequence built from the content characters of each processed line.</returns>
    private static VirtualCharGreenSequence TryConvertMultiLineRawStringToVirtualChars(
        SyntaxToken token, ExpressionSyntax parentExpression, bool tokenIncludeDelimiters)
    {
        // Determine if this is the first text content chunk of the multi-line literal.  The first chunk contains
        // the leading indentation of the line it's on (which thus must be trimmed), while all subsequent chunks do
        // not (because they start right after some `{...}` interpolation).
        var isFirstChunk =
            parentExpression is LiteralExpressionSyntax ||
            (parentExpression is InterpolatedStringExpressionSyntax { Contents: [var firstContent, ..] } && firstContent == token.GetRequiredParent());
 
        // Don't attempt to interpret the contents if the containing expression has errors.
        if (parentExpression.GetDiagnostics().Any(d => d.Severity == DiagnosticSeverity.Error))
            return default;
 
        // Use the parent multi-line expression to determine what whitespace to remove from the start of each line.
        // The indentation is measured on the line containing the end of the parent expression; 0 is used when
        // that line reports no first-non-whitespace offset.
        var parentSourceText = parentExpression.SyntaxTree.GetText();
        var indentationLength = parentSourceText.Lines.GetLineFromPosition(parentExpression.Span.End).GetFirstNonWhitespaceOffset() ?? 0;
 
        // Create a source-text view over the token.  This makes it very easy to treat the token as a set of lines
        // that can be processed sensibly.
        var tokenSourceText = SourceText.From(token.Text);
 
        // If the token includes the delimiters, then we want to start on line 1 so we skip the space and newline
        // that follow the initial `"""`.
        var startLineInclusive = tokenIncludeDelimiters ? 1 : 0;
 
        // Similarly, if the token includes the delimiters, then we don't want to include the line contents for the
        // line that has the final `    """` on it.
        var lastLineExclusive = tokenIncludeDelimiters ? tokenSourceText.Lines.Count - 1 : tokenSourceText.Lines.Count;
 
        var result = ImmutableSegmentedList.CreateBuilder<VirtualCharGreen>();
        for (var lineNumber = startLineInclusive; lineNumber < lastLineExclusive; lineNumber++)
        {
            var currentLine = tokenSourceText.Lines[lineNumber];
            var lineSpan = currentLine.Span;
            var lineStart = lineSpan.Start;
 
            // If we're on the second line onwards, we want to trim the indentation if we have it.  We also always
            // do this for the first line of the first chunk as that will contain the initial leading whitespace.
            if (isFirstChunk || lineNumber > startLineInclusive)
            {
                // A line no longer than the indentation contributes no content of its own (only its line break,
                // if any, is emitted below).
                lineStart = lineSpan.Length > indentationLength
                    ? lineSpan.Start + indentationLength
                    : lineSpan.End;
            }
 
            // The last processed line does not include the final newline on the line.
            var lineEnd = lineNumber == lastLineExclusive - 1 ? currentLine.End : currentLine.EndIncludingLineBreak;
 
            // Now that we've found the start and end portions of that line, convert all the characters within to
            // virtual chars.  Each conversion returns how many source characters it consumed.
            for (var i = lineStart; i < lineEnd;)
                i += ConvertTextAtIndexToVirtualChar(tokenSourceText, i, result);
        }
 
        return VirtualCharGreenSequence.Create(result.ToImmutable());
    }
 
    /// <summary>
    /// Converts the contents of an escape-processing string-like token (between <paramref name="startDelimiter"/>
    /// and <paramref name="endDelimiter"/>) into virtual chars.
    /// </summary>
    /// <param name="token">The token whose <see cref="SyntaxToken.Text"/> is converted.</param>
    /// <param name="startDelimiter">Text the token must start with; may be empty.</param>
    /// <param name="endDelimiter">Text the token must end with (compared ignoring case); may be empty.</param>
    /// <param name="escapeBraces">Whether <c>{</c> and <c>}</c> are treated as escapable characters (as in
    /// interpolated string text).</param>
    /// <returns><see langword="default"/> if the delimiters do not match or an escape cannot be interpreted;
    /// otherwise the sequence of virtual chars for the token contents.</returns>
    private static VirtualCharGreenSequence TryConvertStringToVirtualChars(
        SyntaxToken token, string startDelimiter, string endDelimiter, bool escapeBraces)
    {
        var tokenText = token.Text;

        // Delimiters are plain punctuation, so compare ordinally rather than with culture-sensitive rules.
        if (startDelimiter.Length > 0 && !tokenText.StartsWith(startDelimiter, StringComparison.Ordinal))
        {
            Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
            return default;
        }

        // Case-insensitive so a suffix such as `u8` matches however it was cased in source.
        if (endDelimiter.Length > 0 && !tokenText.EndsWith(endDelimiter, StringComparison.OrdinalIgnoreCase))
        {
            Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
            return default;
        }

        var startIndexInclusive = startDelimiter.Length;
        var endIndexExclusive = tokenText.Length - endDelimiter.Length;

        // The start and end delimiters must not overlap (e.g. a token consisting of a single quote).
        if (endIndexExclusive < startIndexInclusive)
        {
            Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
            return default;
        }

        // Avoid creating and processing the individual chars if there are no escapes in the string.
        if (!ContainsEscape(tokenText.AsSpan(startIndexInclusive, endIndexExclusive - startIndexInclusive), escapeBraces))
        {
            var sequence = VirtualCharGreenSequence.Create(tokenText);
            return sequence[startIndexInclusive..endIndexExclusive];
        }

        using var pooledRuneResults = s_pooledBuilders.GetPooledObject();
        var charResults = pooledRuneResults.Object;

        try
        {
            for (var index = startIndexInclusive; index < endIndexExclusive;)
            {
                var ch = tokenText[index];
                if (ch == '\\')
                {
                    if (TryAddEscape(charResults, tokenText, index) is not int escapeWidth)
                        return default;

                    index += escapeWidth;
                }
                else if (escapeBraces && IsOpenOrCloseBrace(ch))
                {
                    if (!IsLegalBraceEscape(tokenText, index, out var braceWidth))
                        return default;

                    charResults.Add(new VirtualCharGreen(ch, index, braceWidth));
                    index += braceWidth;
                }
                else
                {
                    charResults.Add(new VirtualCharGreen(ch, index, width: 1));
                    index++;
                }
            }

            return CreateVirtualCharSequence(tokenText, startIndexInclusive, endIndexExclusive, charResults);
        }
        finally
        {
            // Always empty the builder before it goes back to the pool (which happens when the using
            // declaration above is disposed, after this finally block).  Without this, an early
            // `return default` would leave stale chars in the pooled builder for the next caller.
            charResults.Clear();
        }
    }
 
    /// <summary>
    /// Returns <see langword="true"/> if <paramref name="tokenText"/> contains a backslash, or, when
    /// <paramref name="escapeBraces"/> is set, an open or close brace.
    /// </summary>
    private static bool ContainsEscape(ReadOnlySpan<char> tokenText, bool escapeBraces)
    {
        for (var i = 0; i < tokenText.Length; i++)
        {
            var current = tokenText[i];
            if (current == '\\' || (escapeBraces && IsOpenOrCloseBrace(current)))
                return true;
        }

        return false;
    }
 
    /// <summary>
    /// Interprets the escape sequence starting at the backslash at <paramref name="index"/>, first as a
    /// single-character escape and, if that does not apply, as a multi-character escape.
    /// </summary>
    /// <returns>The number of characters consumed, or <see langword="null"/> if neither form applied.</returns>
    private static int? TryAddEscape(
        ImmutableSegmentedList<VirtualCharGreen>.Builder result,
        string tokenText,
        int index)
    {
        // Copied from Lexer.ScanEscapeSequence.
        Debug.Assert(tokenText[index] == '\\');

        var consumed = TryAddSingleCharacterEscape(result, tokenText, index);
        if (consumed.HasValue)
            return consumed;

        return TryAddMultiCharacterEscape(result, tokenText, index);
    }
 
    /// <summary>
    /// Forwards to <c>TryGetEscapeCharacter</c> on <paramref name="ch"/> itself and returns its result.
    /// </summary>
    public override bool TryGetEscapeCharacter(VirtualChar ch, out char escapedChar)
    {
        return ch.TryGetEscapeCharacter(out escapedChar);
    }
 
    /// <summary>
    /// Interprets a simple two-character escape (a backslash followed by one character) starting at
    /// <paramref name="index"/> and, when recognized, appends the character it denotes to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="result">Builder that receives the resulting virtual char.</param>
    /// <param name="tokenText">The full text of the token being converted.</param>
    /// <param name="index">Index of the backslash that begins the escape.</param>
    /// <returns>The number of characters consumed (always 2 on success), or <see langword="null"/> if the
    /// character after the backslash is not a simple escape.</returns>
    private static int? TryAddSingleCharacterEscape(
        ImmutableSegmentedList<VirtualCharGreen>.Builder result, string tokenText, int index)
    {
        // Copied from Lexer.ScanEscapeSequence.
        Debug.Assert(tokenText[index] == '\\');

        // A simple escape always spans the backslash plus exactly one following character.
        const int escapeWidth = 2;

        // Keep in sync with EscapeForRegularString
        char? translated = tokenText[index + 1] switch
        {
            // escaped characters that translate to themselves
            '\'' => '\'',
            '"' => '"',
            '\\' => '\\',
            // translate escapes as per C# spec 2.4.4.4
            '0' => '\0',
            'a' => '\a',
            'b' => '\b',
            'e' => '\u001b',
            'f' => '\f',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'v' => '\v',
            _ => null,
        };

        if (translated is not char ch)
            return null;

        result.Add(new VirtualCharGreen(ch, offset: index, width: escapeWidth));

        // Return the known width directly rather than reading it back out of the builder.
        return escapeWidth;
    }
 
    /// <summary>
    /// Interprets a <c>\x</c>, <c>\u</c> or <c>\U</c> escape starting at the backslash at
    /// <paramref name="index"/>.
    /// </summary>
    /// <returns>The number of characters consumed, or <see langword="null"/> if the character after the
    /// backslash is none of <c>x</c>, <c>u</c>, <c>U</c> or the escape could not be interpreted.</returns>
    private static int? TryAddMultiCharacterEscape(
        ImmutableSegmentedList<VirtualCharGreen>.Builder result, string tokenText, int index)
    {
        // Copied from Lexer.ScanEscapeSequence.
        Debug.Assert(tokenText[index] == '\\');

        var escapeKind = tokenText[index + 1];
        if (escapeKind is 'x' or 'u' or 'U')
            return TryAddMultiCharacterEscape(result, tokenText, index, escapeKind);

        Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
        return null;
    }
 
    /// <summary>
    /// Interprets a hexadecimal escape (<c>\U</c> with 8 digits, <c>\u</c> with 4 digits, or <c>\x</c> with 1 to 4
    /// digits) that starts at the backslash at <paramref name="index"/> and appends the resulting char(s) to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="result">Builder that receives one virtual char, or two (a surrogate pair) for a <c>\U</c>
    /// escape whose value is 0x10000 or above.</param>
    /// <param name="tokenText">The full text of the token being converted.</param>
    /// <param name="index">Index of the backslash that begins the escape.</param>
    /// <param name="character">The escape type character following the backslash: <c>U</c>, <c>u</c>, or
    /// (asserted otherwise) <c>x</c>.</param>
    /// <returns>The number of characters consumed, or <see langword="null"/> if the digits are not valid
    /// hexadecimal or the <c>\U</c> value exceeds 0x10FFFF.</returns>
    private static int? TryAddMultiCharacterEscape(
        ImmutableSegmentedList<VirtualCharGreen>.Builder result,
        string tokenText,
        int index,
        char character)
    {
        var startIndex = index;
        Debug.Assert(tokenText[index] == '\\');
 
        // skip past the \ and the escape type.
        index += 2;

        // NOTE(review): the `U` and `u` branches index tokenText without bounds checks; presumably the
        // no-diagnostics precondition guarantees all digits are present -- confirm against callers.
        if (character == 'U')
        {
            // 8 character escape.  May represent 1 or 2 actual chars.
            uint uintChar = 0;
 
            if (!IsHexDigit(tokenText[index]))
            {
                Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                return null;
            }
 
            // Accumulate the 8 hex digits, most significant first.  Note the `character` parameter is reused
            // here as the current digit.
            for (var i = 0; i < 8; i++)
            {
                character = tokenText[index + i];
                if (!IsHexDigit(character))
                {
                    Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                    return null;
                }
 
                uintChar = (uint)((uintChar << 4) + HexValue(character));
            }
 
            // Copied from Lexer.cs and SlidingTextWindow.cs
 
            // Values above 0x10FFFF are rejected.
            if (uintChar > 0x0010FFFF)
            {
                Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                return null;
            }
 
            if (uintChar < 0x00010000)
            {
                // something like \U0000000A
                //
                // Represents a single char value.
                result.Add(new VirtualCharGreen((char)uintChar, offset: startIndex, width: 2 + 8));
            }
            else
            {
                Debug.Assert(uintChar is > 0x0000FFFF and <= 0x0010FFFF);
                var lowSurrogate = ((uintChar - 0x00010000) % 0x0400) + 0xDC00;
                var highSurrogate = ((uintChar - 0x00010000) / 0x0400) + 0xD800;
 
                // Encode this as a surrogate pair.  For the purposes of mapping, we'll say the high surrogate maps to
                // the first 6 chars (the \UAAAA in \UAAAABBBB) and the low surrogate maps to the last 4 chars (the BBBB
                // in \UAAAABBBB).
                const string prefix = @"\UAAAA";
                result.Add(new VirtualCharGreen((char)highSurrogate, offset: startIndex, width: prefix.Length));
                result.Add(new VirtualCharGreen((char)lowSurrogate, offset: startIndex + prefix.Length, width: 4));
            }
 
            // Either way the whole 10-character escape was consumed.
            return @"\UAAAABBBB".Length;
        }
        else if (character == 'u')
        {
            // 4 character escape representing one char.
 
            var intChar = 0;
            if (!IsHexDigit(tokenText[index]))
            {
                Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                return null;
            }
 
            for (var i = 0; i < 4; i++)
            {
                var ch2 = tokenText[index + i];
                if (!IsHexDigit(ch2))
                {
                    Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                    return null;
                }
 
                intChar = (intChar << 4) + HexValue(ch2);
            }
 
            character = (char)intChar;
            var width = @"\uAAAA".Length;
            result.Add(new VirtualCharGreen(character, offset: startIndex, width));
            return width;
        }
        else
        {
            Debug.Assert(character == 'x');
            // Variable length (up to 4 chars) hexadecimal escape.
 
            var intChar = 0;

            // At least one hex digit is required.
            if (!IsHexDigit(tokenText[index]))
            {
                Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
                return null;
            }
 
            // endIndex advances in lock-step with i (endIndex == index + i), so the bounds test in the loop
            // condition guards the tokenText[index + i] read below.
            var endIndex = index;
            for (var i = 0; i < 4 && endIndex < tokenText.Length; i++)
            {
                var ch2 = tokenText[index + i];
                if (!IsHexDigit(ch2))
                {
                    // This is possible.  These escape sequences are variable length.
                    break;
                }
 
                intChar = (intChar << 4) + HexValue(ch2);
                endIndex++;
            }
 
            character = (char)intChar;

            // Width covers the backslash, the `x`, and however many digits were read.
            var width = endIndex - startIndex;
            result.Add(new VirtualCharGreen(character, offset: startIndex, width));
            return width;
        }
    }
 
    /// <summary>
    /// Returns the numeric value (0-15) of the hexadecimal digit <paramref name="c"/>.  The caller must pass a
    /// valid hex digit; this is only asserted in debug builds.
    /// </summary>
    private static int HexValue(char c)
    {
        Debug.Assert(IsHexDigit(c));

        if (c >= '0' && c <= '9')
            return c - '0';

        // Masking with 0xdf clears the 0x20 bit, folding 'a'-'f' onto 'A'-'F'.
        var upperCased = c & 0xdf;
        return upperCased - 'A' + 10;
    }
 
    /// <summary>
    /// Returns <see langword="true"/> if <paramref name="c"/> is an ASCII hexadecimal digit:
    /// <c>0</c>-<c>9</c>, <c>A</c>-<c>F</c> or <c>a</c>-<c>f</c>.
    /// </summary>
    private static bool IsHexDigit(char c)
    {
        if (c >= '0' && c <= '9')
            return true;

        if (c >= 'A' && c <= 'F')
            return true;

        return c >= 'a' && c <= 'f';
    }
}
// File: src\Workspaces\SharedUtilitiesAndExtensions\Compiler\CSharp\EmbeddedLanguages\VirtualChars\CSharpVirtualCharService.cs	Web Access
// Project: src\src\Workspaces\CSharp\Portable\Microsoft.CodeAnalysis.CSharp.Workspaces.csproj (Microsoft.CodeAnalysis.CSharp.Workspaces)