|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis.Collections;
using Microsoft.CodeAnalysis.CSharp.LanguageService;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.LanguageService;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Shared.Extensions;
using Microsoft.CodeAnalysis.Text;
using Microsoft.CodeAnalysis.Utilities;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.CSharp.EmbeddedLanguages.VirtualChars;
internal class CSharpVirtualCharService : AbstractVirtualCharService
{
/// <summary>
/// Shared instance of this service, exposed through the <see cref="IVirtualCharService"/> interface.
/// </summary>
public static readonly IVirtualCharService Instance = new CSharpVirtualCharService();
/// <summary>
/// Pool of <see cref="VirtualChar"/> list builders. <c>CreateVirtualCharSequence</c> takes a builder from here
/// to hold its rune results and clears it again before the builder is released back to the pool.
/// </summary>
private static readonly ObjectPool<ImmutableSegmentedList<VirtualChar>.Builder> s_pooledBuilders = new(() => ImmutableSegmentedList.CreateBuilder<VirtualChar>());
/// <summary>
/// Not publicly constructible. Within this file the only instance created is <see cref="Instance"/>.
/// </summary>
protected CSharpVirtualCharService()
{
}
/// <summary>
/// Supplies the C# syntax facts (<see cref="CSharpSyntaxFacts.Instance"/>) for this service.
/// </summary>
protected override ISyntaxFacts SyntaxFacts => CSharpSyntaxFacts.Instance;
/// <summary>
/// Determines whether <paramref name="token"/> belongs to a multi-line raw string. That is the case when the
/// token itself is a (UTF-8) multi-line raw string literal token, or when its grandparent is an interpolated
/// string whose start token is a multi-line raw string start token.
/// </summary>
protected override bool IsMultiLineRawStringToken(SyntaxToken token)
{
    return token.Kind() switch
    {
        SyntaxKind.MultiLineRawStringLiteralToken or SyntaxKind.Utf8MultiLineRawStringLiteralToken => true,

        // Otherwise, look for text that sits inside a multi-line raw interpolated string.
        _ => token.Parent?.Parent is InterpolatedStringExpressionSyntax interpolatedString &&
             interpolatedString.StringStartToken.RawKind == (int)SyntaxKind.InterpolatedMultiLineRawStringStartToken,
    };
}
/// <summary>
/// Dispatches on the kind of <paramref name="token"/> to the conversion routine appropriate for that form of
/// character/string literal (or interpolated string text). Returns <see langword="default"/> for tokens inside
/// preprocessor directives and for any token kind that is not handled here.
/// </summary>
protected override VirtualCharSequence TryConvertToVirtualCharsWorker(SyntaxToken token)
{
    // C# preprocessor directives can contain string literals.  However, these string literals do not behave
    // like normal literals.  Because they are used for paths (i.e. in a #line directive), the language does not
    // do any escaping within them.  i.e. if you have a \ it's just a \   Note that this is not a verbatim
    // string.  You can't put a double quote in it either, and you cannot have newlines and whatnot.
    //
    // We technically could convert this trivially to an array of virtual chars.  After all, there would just be
    // a 1:1 correspondence with the literal contents and the chars returned.  However, we don't even bother
    // returning anything here.  That's because there's no useful features we can offer here.  Because there are
    // no escape characters we won't classify any escape characters.  And there is no way that these strings
    // would be Regex/Json snippets.  So it's easier to just bail out and return nothing.
    if (IsInDirective(token.Parent))
        return default;

    Debug.Assert(!token.ContainsDiagnostics);

    switch (token.Kind())
    {
        case SyntaxKind.CharacterLiteralToken:
            return TryConvertStringToVirtualChars(token, "'", "'", escapeBraces: false);

        case SyntaxKind.StringLiteralToken:
            return ConvertQuotedLiteral(token, endDelimiter: "\"");

        case SyntaxKind.Utf8StringLiteralToken:
            return ConvertQuotedLiteral(token, endDelimiter: "\"u8");

        case SyntaxKind.SingleLineRawStringLiteralToken:
        case SyntaxKind.Utf8SingleLineRawStringLiteralToken:
            return TryConvertSingleLineRawStringToVirtualChars(token);

        case SyntaxKind.MultiLineRawStringLiteralToken:
        case SyntaxKind.Utf8MultiLineRawStringLiteralToken:
            if (token.GetRequiredParent() is LiteralExpressionSyntax literalExpression)
                return TryConvertMultiLineRawStringToVirtualChars(token, literalExpression, tokenIncludeDelimiters: true);

            return default;

        case SyntaxKind.InterpolatedStringTextToken:
            return ConvertInterpolatedText(token);

        default:
            return default;
    }

    // Regular and verbatim double-quoted literals differ only in their start delimiter and in the routine
    // that processes their contents.
    static VirtualCharSequence ConvertQuotedLiteral(SyntaxToken literal, string endDelimiter)
    {
        if (literal.IsVerbatimStringLiteral())
            return TryConvertVerbatimStringToVirtualChars(literal, "@\"", endDelimiter, escapeBraces: false);

        return TryConvertStringToVirtualChars(literal, "\"", endDelimiter, escapeBraces: false);
    }

    // Text inside an interpolated string carries no delimiters of its own.  How it is converted depends on
    // the start token of the interpolated string that contains it.
    static VirtualCharSequence ConvertInterpolatedText(SyntaxToken textToken)
    {
        var container = textToken.GetRequiredParent();
        var inFormatClause = container is InterpolationFormatClauseSyntax;
        if (inFormatClause)
            container = container.GetRequiredParent();

        var interpolatedString = (InterpolatedStringExpressionSyntax)container.GetRequiredParent();
        switch (interpolatedString.StringStartToken.Kind())
        {
            case SyntaxKind.InterpolatedStringStartToken:
                return TryConvertStringToVirtualChars(textToken, "", "", escapeBraces: true);

            case SyntaxKind.InterpolatedVerbatimStringStartToken:
                return TryConvertVerbatimStringToVirtualChars(textToken, "", "", escapeBraces: true);

            case SyntaxKind.InterpolatedSingleLineRawStringStartToken:
                return TryConvertSingleLineRawStringToVirtualChars(textToken);

            case SyntaxKind.InterpolatedMultiLineRawStringStartToken:
                // Format clauses must be single line, even when in a multi-line interpolation.
                if (inFormatClause)
                    return TryConvertSingleLineRawStringToVirtualChars(textToken);

                return TryConvertMultiLineRawStringToVirtualChars(textToken, interpolatedString, tokenIncludeDelimiters: false);

            default:
                return default;
        }
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="node"/> or any of its ancestors is a
/// <see cref="DirectiveTriviaSyntax"/>. The walk ascends out of structured trivia.
/// </summary>
private static bool IsInDirective(SyntaxNode? node)
{
    for (var current = node; current != null; current = current.GetParent(ascendOutOfTrivia: true))
    {
        if (current is DirectiveTriviaSyntax)
            return true;
    }

    return false;
}
/// <summary>
/// Converts a verbatim string token (or verbatim interpolated string text) to virtual chars. All arguments are
/// forwarded unchanged to <c>TryConvertSimpleDoubleQuoteString</c>, which is declared outside this file.
/// </summary>
private static VirtualCharSequence TryConvertVerbatimStringToVirtualChars(SyntaxToken token, string startDelimiter, string endDelimiter, bool escapeBraces)
{
    return TryConvertSimpleDoubleQuoteString(token, startDelimiter, endDelimiter, escapeBraces);
}
/// <summary>
/// Converts the contents of a single-line raw string token to virtual chars. For (UTF-8) single-line raw
/// string literal tokens the surrounding quote runs, and the trailing <c>u8</c> suffix if present, are
/// excluded. For any other token kind passed in, the entire token text is treated as content.
/// </summary>
private static VirtualCharSequence TryConvertSingleLineRawStringToVirtualChars(SyntaxToken token)
{
    var text = token.Text;
    var tokenStart = token.SpanStart;
    var kind = token.Kind();

    var contentStart = 0;
    var contentEnd = text.Length;

    var isUtf8Literal = kind is SyntaxKind.Utf8SingleLineRawStringLiteralToken;
    if (isUtf8Literal)
    {
        Contract.ThrowIfFalse(text is [.., 'u' or 'U', '8']);
        contentEnd -= "u8".Length;
    }

    if (isUtf8Literal || kind is SyntaxKind.SingleLineRawStringLiteralToken)
    {
        Contract.ThrowIfFalse(text[0] == '"');

        // Strip one quote from each side for every leading quote found.
        for (; text[contentStart] == '"'; contentStart++, contentEnd--)
        {
            // All quotes should be paired at the end
            Contract.ThrowIfFalse(text[contentEnd - 1] == '"');
        }
    }

    var builder = ImmutableSegmentedList.CreateBuilder<VirtualChar>();
    var position = contentStart;
    while (position < contentEnd)
        position += ConvertTextAtIndexToRune(text, position, builder, tokenStart);

    return CreateVirtualCharSequence(text, tokenStart, contentStart, contentEnd, builder);
}
/// <summary>
/// Creates the sequence for the <b>content</b> characters in this <paramref name="token"/>.  This will not
/// include indentation whitespace that the language specifies is not part of the content.
/// </summary>
/// <param name="token">The token whose text is split into lines and converted.</param>
/// <param name="parentExpression">The containing expression for this token.  This is needed so that we can
/// determine the indentation whitespace based on the last line of the containing multiline literal.</param>
/// <param name="tokenIncludeDelimiters">If this token includes the quote (<c>"</c>) characters for the
/// delimiters inside of it or not.  If so, then those quotes will need to be skipped when determining the
/// content</param>
/// <returns>The virtual chars for the content, or <see langword="default"/> if
/// <paramref name="parentExpression"/> has any error diagnostics.</returns>
private static VirtualCharSequence TryConvertMultiLineRawStringToVirtualChars(
SyntaxToken token, ExpressionSyntax parentExpression, bool tokenIncludeDelimiters)
{
// Determine if this is the first text content chunk of the multi-line literal.  The first chunk contains the
// leading indentation of the line it's on (which thus must be trimmed), while all subsequent chunks do not
// (because they start right after some `{...}` interpolation).
var isFirstChunk =
parentExpression is LiteralExpressionSyntax ||
(parentExpression is InterpolatedStringExpressionSyntax { Contents: var contents } && contents.First() == token.GetRequiredParent());
// Bail out if the containing expression has errors; nothing is converted in that case.
if (parentExpression.GetDiagnostics().Any(d => d.Severity == DiagnosticSeverity.Error))
return default;
// Use the parent multi-line expression to determine what whitespace to remove from the start of each line.
// The indentation is taken from the line on which the parent expression ends.
var parentSourceText = parentExpression.SyntaxTree.GetText();
var indentationLength = parentSourceText.Lines.GetLineFromPosition(parentExpression.Span.End).GetFirstNonWhitespaceOffset() ?? 0;
// Create a source-text view over the token.  This makes it very easy to treat the token as a set of lines
// that can be processed sensibly.
var tokenSourceText = SourceText.From(token.Text);
// If the token includes the delimiters, then we want to start on line 1 so we skip the space and newline
// that follow the initial `"""`.
var startLineInclusive = tokenIncludeDelimiters ? 1 : 0;
// Similarly, if the token includes the delimiters, then we don't want to include the line contents for the
// line that has the final `    """` on it.
var lastLineExclusive = tokenIncludeDelimiters ? tokenSourceText.Lines.Count - 1 : tokenSourceText.Lines.Count;
var result = ImmutableSegmentedList.CreateBuilder<VirtualChar>();
for (var lineNumber = startLineInclusive; lineNumber < lastLineExclusive; lineNumber++)
{
var currentLine = tokenSourceText.Lines[lineNumber];
var lineSpan = currentLine.Span;
var lineStart = lineSpan.Start;
// If we're on the second line onwards, we want to trim the indentation if we have it.  We also always
// do this for the first line of the first chunk as that will contain the initial leading whitespace.
if (isFirstChunk || lineNumber > startLineInclusive)
{
// A line that is not longer than the indentation contributes no content of its own (only its line
// break, if one is included below).
lineStart = lineSpan.Length > indentationLength
? lineSpan.Start + indentationLength
: lineSpan.End;
}
// The last processed line does not include the final newline on the line.
var lineEnd = lineNumber == lastLineExclusive - 1 ? currentLine.End : currentLine.EndIncludingLineBreak;
// Now that we've found the start and end portions of that line, convert all the characters within to
// virtual chars.  Positions are relative to the token text, so the token's start is passed as the offset.
for (var i = lineStart; i < lineEnd;)
i += ConvertTextAtIndexToRune(tokenSourceText, i, result, token.SpanStart);
}
return VirtualCharSequence.Create(result.ToImmutable());
}
/// <summary>
/// Converts a token whose contents use regular (backslash) escapes, such as a character literal, a regular
/// string literal, or regular interpolated string text, into virtual chars.
/// </summary>
/// <param name="token">The token whose text is converted.</param>
/// <param name="startDelimiter">Text the token is expected to start with.  May be empty, in which case no
/// check is made.  Compared ordinally.</param>
/// <param name="endDelimiter">Text the token is expected to end with (for example <c>"u8</c>).  May be empty,
/// in which case no check is made.  Compared ordinally, ignoring case.</param>
/// <param name="escapeBraces">Whether <c>{</c> and <c>}</c> characters are only accepted when they form a legal
/// brace escape.</param>
/// <returns>The converted sequence, or <see langword="default"/> if a delimiter is missing, an escape sequence
/// could not be processed, or (when <paramref name="escapeBraces"/> is set) a brace is not legally escaped.</returns>
private static VirtualCharSequence TryConvertStringToVirtualChars(
    SyntaxToken token, string startDelimiter, string endDelimiter, bool escapeBraces)
{
    var tokenText = token.Text;

    // Delimiters are fixed source characters, so they are matched ordinally rather than with the
    // culture-sensitive default of string.StartsWith(string).
    if (startDelimiter.Length > 0 && !tokenText.StartsWith(startDelimiter, StringComparison.Ordinal))
    {
        Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
        return default;
    }

    if (endDelimiter.Length > 0 && !tokenText.EndsWith(endDelimiter, StringComparison.OrdinalIgnoreCase))
    {
        Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
        return default;
    }

    var startIndexInclusive = startDelimiter.Length;
    var endIndexExclusive = tokenText.Length - endDelimiter.Length;

    // Do things in two passes.  First, convert everything in the string to a 16-bit-char+span.  Then walk
    // again, trying to create Runes from the 16-bit-chars.  We do this to simplify complex cases where we may
    // have escapes and non-escapes mixed together.
    using var _ = ArrayBuilder<(char ch, TextSpan span)>.GetInstance(out var charResults);

    // First pass, just convert everything in the string (i.e. escapes) to plain 16-bit characters.
    var offset = token.SpanStart;
    for (var index = startIndexInclusive; index < endIndexExclusive;)
    {
        var ch = tokenText[index];
        if (ch == '\\')
        {
            if (!TryAddEscape(charResults, tokenText, offset, index))
                return default;

            // Advance past the whole escape.  The last entry added always carries the escape's full source
            // length, even when the escape produced two entries (a surrogate pair).
            index += charResults.Last().span.Length;
        }
        else if (escapeBraces && IsOpenOrCloseBrace(ch))
        {
            if (!IsLegalBraceEscape(tokenText, index, offset, out var braceSpan))
                return default;

            // The escaped brace yields a single char whose span covers the entire escape.
            charResults.Add((ch, braceSpan));
            index += charResults.Last().span.Length;
        }
        else
        {
            // Ordinary character: maps 1:1 onto a single source position.
            charResults.Add((ch, new TextSpan(offset + index, 1)));
            index++;
        }
    }

    // Second pass (combining the 16-bit chars into runes) happens in the helper.
    return CreateVirtualCharSequence(tokenText, offset, startIndexInclusive, endIndexExclusive, charResults);
}
/// <summary>
/// Second conversion pass: combines the 16-bit characters in <paramref name="charResults"/> into runes using a
/// pooled builder, and then creates the final sequence from those runes.
/// </summary>
private static VirtualCharSequence CreateVirtualCharSequence(
    string tokenText, int offset, int startIndexInclusive, int endIndexExclusive, ArrayBuilder<(char ch, TextSpan span)> charResults)
{
    using (var pooledBuilder = s_pooledBuilders.GetPooledObject())
    {
        var runes = pooledBuilder.Object;
        try
        {
            // Second pass.  Convert those characters to Runes.
            ConvertCharactersToRunes(charResults, runes);
            return CreateVirtualCharSequence(tokenText, offset, startIndexInclusive, endIndexExclusive, runes);
        }
        finally
        {
            // Ensure the builder is cleared out before releasing back to the pool.
            runes.Clear();
        }
    }
}
/// <summary>
/// Walks <paramref name="charResults"/> and appends one <see cref="VirtualChar"/> per logical character to
/// <paramref name="runeResults"/>.  A char that forms a rune on its own is added directly; a char that forms a
/// rune together with the next char is merged into a single entry spanning both; anything else is added as a
/// plain char.
/// </summary>
private static void ConvertCharactersToRunes(ArrayBuilder<(char ch, TextSpan span)> charResults, ImmutableSegmentedList<VirtualChar>.Builder runeResults)
{
    var index = 0;
    while (index < charResults.Count)
    {
        var (current, currentSpan) = charResults[index];

        if (Rune.TryCreate(current, out var singleRune))
        {
            // A valid single char that can become a Rune by itself.
            runeResults.Add(VirtualChar.Create(singleRune, currentSpan));
            index++;
        }
        else if (index + 1 < charResults.Count &&
                 Rune.TryCreate(current, charResults[index + 1].ch, out var pairedRune))
        {
            // A surrogate pair.  The resulting rune covers the source text of both halves.
            var followingSpan = charResults[index + 1].span;
            runeResults.Add(VirtualChar.Create(pairedRune, TextSpan.FromBounds(currentSpan.Start, followingSpan.End)));
            index += 2;
        }
        else
        {
            // Had an unpaired surrogate.
            Debug.Assert(char.IsSurrogate(current));
            runeResults.Add(VirtualChar.Create(current, currentSpan));
            index++;
        }
    }
}
/// <summary>
/// Processes the escape sequence whose backslash is at <paramref name="index"/> in
/// <paramref name="tokenText"/>.  Single-character escapes are tried first, then the multi-character
/// (<c>\x</c>, <c>\u</c>, <c>\U</c>) forms.  Returns <see langword="false"/> if neither applies.
/// </summary>
private static bool TryAddEscape(
    ArrayBuilder<(char ch, TextSpan span)> result, string tokenText, int offset, int index)
{
    // Copied from Lexer.ScanEscapeSequence.
    Debug.Assert(tokenText[index] == '\\');

    if (TryAddSingleCharacterEscape(result, tokenText, offset, index))
        return true;

    return TryAddMultiCharacterEscape(result, tokenText, offset, index);
}
/// <summary>
/// Forwards to <see cref="VirtualChar.TryGetEscapeCharacter"/> on <paramref name="ch"/> and returns its
/// result.
/// </summary>
public override bool TryGetEscapeCharacter(VirtualChar ch, out char escapedChar)
{
    return ch.TryGetEscapeCharacter(out escapedChar);
}
/// <summary>
/// Handles two-character escapes such as <c>\n</c> or <c>\\</c>.  On success a single entry is added to
/// <paramref name="result"/> whose span covers the backslash and the following character.  Returns
/// <see langword="false"/>, adding nothing, when the character after the backslash is not one of the
/// recognized single-character escapes.
/// </summary>
private static bool TryAddSingleCharacterEscape(
    ArrayBuilder<(char ch, TextSpan span)> result, string tokenText, int offset, int index)
{
    // Copied from Lexer.ScanEscapeSequence.
    Debug.Assert(tokenText[index] == '\\');

    var escapeKind = tokenText[index + 1];

    // Keep in sync with EscapeForRegularString
    char? translated = escapeKind switch
    {
        // escaped characters that translate to themselves
        '\'' or '"' or '\\' => escapeKind,

        // translate escapes as per C# spec 2.4.4.4
        '0' => '\0',
        'a' => '\a',
        'b' => '\b',
        'e' => '\u001b',
        'f' => '\f',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        'v' => '\v',

        _ => (char?)null,
    };

    if (!translated.HasValue)
        return false;

    result.Add((translated.Value, new TextSpan(offset + index, 2)));
    return true;
}
/// <summary>
/// Handles the <c>\x</c>, <c>\u</c> and <c>\U</c> escapes by delegating to the overload that takes the escape
/// type.  Any other character after the backslash is unexpected and yields <see langword="false"/>.
/// </summary>
private static bool TryAddMultiCharacterEscape(
    ArrayBuilder<(char ch, TextSpan span)> result, string tokenText, int offset, int index)
{
    // Copied from Lexer.ScanEscapeSequence.
    Debug.Assert(tokenText[index] == '\\');

    var escapeKind = tokenText[index + 1];
    if (escapeKind is 'x' or 'u' or 'U')
        return TryAddMultiCharacterEscape(result, tokenText, offset, index, escapeKind);

    Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
    return false;
}
/// <summary>
/// Decodes a hexadecimal escape starting at the backslash at <paramref name="index"/> and appends the
/// resulting 16-bit character(s) to <paramref name="result"/>.
/// </summary>
/// <param name="result">Receives the decoded character(s) along with their source spans.</param>
/// <param name="tokenText">The full text of the token being processed.</param>
/// <param name="offset">Value added to indices into <paramref name="tokenText"/> when producing spans.</param>
/// <param name="index">Index in <paramref name="tokenText"/> of the backslash that starts the escape.</param>
/// <param name="character">The escape type: <c>U</c> (eight hex digits), <c>u</c> (four hex digits), or
/// <c>x</c> (one to four hex digits).</param>
/// <returns><see langword="true"/> if the escape was decoded; <see langword="false"/> if an expected hex digit
/// is missing or a <c>\U</c> value exceeds <c>0x10FFFF</c>.</returns>
private static bool TryAddMultiCharacterEscape(
ArrayBuilder<(char ch, TextSpan span)> result, string tokenText, int offset, int index, char character)
{
var startIndex = index;
Debug.Assert(tokenText[index] == '\\');
// skip past the \ and the escape type.
index += 2;
if (character == 'U')
{
// 8 character escape.  May represent 1 or 2 actual chars.
uint uintChar = 0;
if (!IsHexDigit(tokenText[index]))
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
// Accumulate all eight digits, four bits at a time.  Note: the 'character' parameter is reused here as
// the loop's scratch variable.
for (var i = 0; i < 8; i++)
{
character = tokenText[index + i];
if (!IsHexDigit(character))
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
uintChar = (uint)((uintChar << 4) + HexValue(character));
}
// Copied from Lexer.cs and SlidingTextWindow.cs
if (uintChar > 0x0010FFFF)
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
if (uintChar < 0x00010000)
{
// something like \U0000000A
//
// Represents a single char value.
result.Add(((char)uintChar, new TextSpan(startIndex + offset, 2 + 8)));
return true;
}
else
{
Debug.Assert(uintChar is > 0x0000FFFF and <= 0x0010FFFF);
var lowSurrogate = ((uintChar - 0x00010000) % 0x0400) + 0xDC00;
var highSurrogate = ((uintChar - 0x00010000) / 0x0400) + 0xD800;
// Encode this as a surrogate pair.  The high surrogate gets a zero-length span and the low surrogate
// (added last) gets the span of the whole escape.  TryConvertStringToVirtualChars advances by the span
// length of the last added entry, so the full length must be on the second entry.
var pos = startIndex + offset;
result.Add(((char)highSurrogate, new TextSpan(pos, 0)));
result.Add(((char)lowSurrogate, new TextSpan(pos, 2 + 8)));
return true;
}
}
else if (character == 'u')
{
// 4 character escape representing one char.
var intChar = 0;
if (!IsHexDigit(tokenText[index]))
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
for (var i = 0; i < 4; i++)
{
var ch2 = tokenText[index + i];
if (!IsHexDigit(ch2))
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
intChar = (intChar << 4) + HexValue(ch2);
}
// The span covers the backslash, the 'u', and the four digits.
character = (char)intChar;
result.Add((character, new TextSpan(startIndex + offset, 2 + 4)));
return true;
}
else
{
Debug.Assert(character == 'x');
// Variable length (up to 4 chars) hexadecimal escape.
var intChar = 0;
if (!IsHexDigit(tokenText[index]))
{
Debug.Fail("This should not be reachable as long as the compiler added no diagnostics.");
return false;
}
// endIndex tracks one past the last hex digit consumed; it always equals index + i inside the loop.
var endIndex = index;
for (var i = 0; i < 4 && endIndex < tokenText.Length; i++)
{
var ch2 = tokenText[index + i];
if (!IsHexDigit(ch2))
{
// This is possible.  These escape sequences are variable length.
break;
}
intChar = (intChar << 4) + HexValue(ch2);
endIndex++;
}
// The span runs from the backslash through the last hex digit that was consumed.
character = (char)intChar;
result.Add((character, TextSpan.FromBounds(startIndex + offset, endIndex + offset)));
return true;
}
}
/// <summary>
/// Returns the numeric value (0-15) of the hexadecimal digit <paramref name="c"/>.  The caller is expected to
/// pass a valid hex digit; this is only checked by a debug assertion.
/// </summary>
private static int HexValue(char c)
{
    Debug.Assert(IsHexDigit(c));

    if (c is >= '0' and <= '9')
        return c - '0';

    // Masking with 0xdf clears bit 0x20, which maps 'a'-'f' onto 'A'-'F'.
    var upperCased = c & 0xdf;
    return upperCased - 'A' + 10;
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="c"/> is one of <c>0-9</c>, <c>A-F</c> or <c>a-f</c>.
/// </summary>
private static bool IsHexDigit(char c)
    => c switch
    {
        >= '0' and <= '9' => true,
        >= 'A' and <= 'F' => true,
        >= 'a' and <= 'f' => true,
        _ => false,
    };
}
|