|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using Microsoft.CodeAnalysis.EmbeddedLanguages.Common;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis.Features.EmbeddedLanguages.Json;
using static EmbeddedSyntaxHelpers;
using static JsonHelpers;
using JsonNodeOrToken = EmbeddedSyntaxNodeOrToken<JsonKind, JsonNode>;
using JsonToken = EmbeddedSyntaxToken<JsonKind>;
using JsonTrivia = EmbeddedSyntaxTrivia<JsonKind>;
using JsonSeparatedList = EmbeddedSeparatedSyntaxNodeList<JsonKind, JsonNode, JsonValueNode>;
/// <summary>
/// Parser used for reading in a sequence of <see cref="VirtualChar"/>s, and producing a <see
/// cref="JsonTree"/> out of it. Parsing will always succeed (except in the case of a
/// stack-overflow) and will consume the entire sequence of chars. General roslyn syntax
/// principles are held to (i.e. immutable, fully representative, etc.).
/// <para>
/// The parser always parses out the same tree for a given input, regardless of the flags
/// passed to it. *However*, depending on those flags, it may return a different set of
/// *diagnostics*. Specifically, the
/// parser supports json.net parsing and strict RFC8259 (https://tools.ietf.org/html/rfc8259).
/// As such, the parser supports a superset of both, but then does a pass at the end to produce
/// appropriate diagnostics.
/// </para>
/// </summary>
/// <remarks>
/// Note: the json structure we parse out is actually very simple. It's effectively all lists
/// of <see cref="JsonValueNode"/> values. We just treat almost everything as a 'value'. For
/// example, a <see cref="JsonPropertyNode"/> (i.e. <c>"x" : 0</c>) is a 'value'. As such, it
/// can show up in arrays (i.e. <c>["x" : 0, "y" : 1]</c>). This is not legal, but it greatly
/// simplifies parsing. Effectively, we just have recursive list parsing, where we accept any
/// sort of value in any sort of context. A later pass will then report errors for the wrong
/// sorts of values showing up in incorrect contexts.
/// <para>
/// Note: We also treat commas (<c>,</c>) as being a 'value' on its own. This simplifies parsing
/// by allowing us to not have to represent Lists and SeparatedLists. It also helps model
/// things that are supported in json.net (like <c>[1,,2]</c>). Our post-parsing pass will
/// then ensure that these comma-values only show up in the right contexts.
/// </para>
/// </remarks>
[NonCopyable]
internal partial struct JsonParser
{
// Pre-formatted "expected" diagnostic messages, one per punctuation token the parser may need to
// synthesize when it is missing. Each is built once from FeaturesResources._0_expected and passed to
// ConsumeToken as the error text.
private static readonly string s_closeBracketExpected = string.Format(FeaturesResources._0_expected, ']');
private static readonly string s_closeBraceExpected = string.Format(FeaturesResources._0_expected, '}');
private static readonly string s_openParenExpected = string.Format(FeaturesResources._0_expected, '(');
private static readonly string s_closeParenExpected = string.Format(FeaturesResources._0_expected, ')');
private static readonly string s_commaExpected = string.Format(FeaturesResources._0_expected, ',');
// The lexer that tokens are pulled from.
private JsonLexer _lexer;
// The token the parser is currently looking at; it has been scanned but not yet consumed.
private JsonToken _currentToken;
// Number of nested Parse* calls currently on the stack. It is handed to
// StackGuard.EnsureSufficientExecutionStack on each recursive step.
private int _recursionDepth;
// Fields used to keep track of what types of json values we're in. They're used for error
// recovery, specifically with respect to encountering unexpected tokens while parsing out a
// sequence of values. For example, if we have: ```{ a: [1, 2, }```, we will mark that
// we're both in an object and in an array. When we then encounter the errant ```}```,
// we'll see that we were in an object, and thus should stop parsing out the sequence for
// the array so that the ```}``` can be consumed by the object we were in. However, if we
// just had ```[1, 2, }```, we would not be in an object, and we would just consume the
// ```}``` as a bogus value inside the array.
//
// This approach of keeping track of the parse contexts we're in, and using them to
// determine if we should consume or pop-out when encountering an error token, mirrors the
// same approach that we use in the C# and TS/JS parsers.
private bool _inObject;
private bool _inArray;
private bool _inConstructor;
/// <summary>
/// Creates a parser over <paramref name="text"/> and primes it by scanning the first token
/// into <see cref="_currentToken"/>.
/// </summary>
private JsonParser(VirtualCharSequence text) : this()
{
    _lexer = new JsonLexer(text);

    // Prime the parser. The value returned here is the default (pre-scan) token, so it is
    // deliberately discarded; the call exists to load the first real token.
    _ = ConsumeCurrentToken();
}
/// <summary>
/// Hands back the token the parser is currently positioned on, and advances the parser by
/// having the lexer scan the token that follows it.
/// </summary>
/// <returns>The token that was current before this call.</returns>
private JsonToken ConsumeCurrentToken()
{
    var consumed = _currentToken;
    _currentToken = _lexer.ScanNextToken();
    return consumed;
}
/// <summary>
/// Parses <paramref name="text"/> into a fully representative syntax tree plus its
/// diagnostics. Parsing should always succeed, except in the case of the stack overflowing.
/// </summary>
/// <param name="text">The characters to parse.</param>
/// <param name="options">Flags that select which diagnostics checks are applied to the tree.</param>
/// <returns>
/// The parsed tree, or <see langword="null"/> when <paramref name="text"/> is default/empty or
/// an <see cref="InsufficientExecutionStackException"/> was raised while parsing.
/// </returns>
public static JsonTree? TryParse(VirtualCharSequence text, JsonOptions options)
{
    try
    {
        return text.IsDefaultOrEmpty
            ? null
            : new JsonParser(text).ParseTree(options);
    }
    catch (InsufficientExecutionStackException)
    {
        // Input nested too deeply to parse safely; report "no tree" rather than failing.
        return null;
    }
}
/// <summary>
/// Parses the whole input into a <see cref="JsonCompilationUnit"/> and wraps it in a
/// <see cref="JsonTree"/> carrying at most one diagnostic.
/// </summary>
/// <param name="options">Selects between the strict and json.net specific checks.</param>
private JsonTree ParseTree(JsonOptions options)
{
    var topLevelValues = this.ParseSequence();

    // The top level sequence only stops at the end of the input, so everything must be consumed.
    Debug.Assert(_lexer.Position == _lexer.Text.Length);
    Debug.Assert(_currentToken.Kind == JsonKind.EndOfFile);

    var root = new JsonCompilationUnit(topLevelValues, _currentToken);

    // Three sources of diagnostics are consulted:
    //  1. those attached to tokens/trivia while parsing (unknown tokens, or tokens that were
    //     needed but not found);
    //  2. grammar rules that apply to both strict and non-strict json;
    //  3. the strict or json.net specific checks, depending on the options.
    // Only the earliest one is reported, to avoid producing a ton of cascaded errors.
    var parseDiagnostic = GetFirstDiagnostic(root);
    var grammarDiagnostic = CheckTopLevel(_lexer.Text, root);
    var modeDiagnostic = options.HasFlag(JsonOptions.Strict)
        ? StrictSyntaxChecker.CheckRootSyntax(root, options)
        : JsonNetSyntaxChecker.CheckSyntax(root);

    var earliest = Earliest(Earliest(parseDiagnostic, grammarDiagnostic), modeDiagnostic);

    return new JsonTree(
        _lexer.Text,
        root,
        earliest.HasValue ? [earliest.Value] : []);
}
/// <summary>
/// Picks whichever of the two diagnostics starts first in the text. If only one is present,
/// that one is returned; if neither is present the result is <see langword="null"/>. When both
/// start at the same position, <paramref name="d1"/> wins.
/// </summary>
private static EmbeddedDiagnostic? Earliest(EmbeddedDiagnostic? d1, EmbeddedDiagnostic? d2)
{
    if (d1 is not { } first)
        return d2;

    if (d2 is not { } second)
        return d1;

    return second.Span.Start < first.Span.Start ? d2 : d1;
}
/// <summary>
/// Checks for errors in json that apply to both json.net and strict mode.
/// </summary>
/// <param name="text">The full text that was parsed; used for the span of the whitespace-only error.</param>
/// <param name="compilationUnit">The root of the parsed tree.</param>
/// <returns>The first problem found, or <see langword="null"/> if there is none.</returns>
private static EmbeddedDiagnostic? CheckTopLevel(
    VirtualCharSequence text, JsonCompilationUnit compilationUnit)
{
    var sequence = compilationUnit.Sequence;

    if (sequence.IsEmpty)
    {
        // json is not allowed to be just whitespace.
        //
        // Note: there is always at least some content (either real nodes in the tree, or trivia on
        // the EOF token), as parsing only happens for a non-empty sequence of virtual chars.
        var isOnlyWhitespace =
            text.Length > 0 &&
            compilationUnit.EndOfFileToken.LeadingTrivia.All(
                t => t.Kind is JsonKind.WhitespaceTrivia or JsonKind.EndOfLineTrivia);

        if (isOnlyWhitespace)
            return new EmbeddedDiagnostic(FeaturesResources.Syntax_error, GetSpan(text));

        return null;
    }

    if (sequence.Length >= 2)
    {
        // The top level can't have more than one actual value; complain about the second one.
        var offendingToken = GetFirstToken(sequence[1]);
        return new EmbeddedDiagnostic(
            string.Format(FeaturesResources._0_unexpected, offendingToken.VirtualChars[0]),
            offendingToken.GetSpan());
    }

    // Exactly one top level value.
    var onlyValue = sequence.Single();
    switch (onlyValue.Kind)
    {
        case JsonKind.CommaValue:
            // Commas should never show up in the top level sequence.
            return new EmbeddedDiagnostic(
                string.Format(FeaturesResources._0_unexpected, ','),
                ((JsonCommaValueNode)onlyValue).CommaToken.GetSpan());

        case JsonKind.Property:
            // Neither should a bare property.
            return new EmbeddedDiagnostic(
                string.Format(FeaturesResources._0_unexpected, ':'),
                ((JsonPropertyNode)onlyValue).ColonToken.GetSpan());

        default:
            return CheckSyntax(onlyValue);
    }

    // Checks the node itself (when it is an array or object), then all of its descendants, and
    // returns whichever problem comes first in the text.
    static EmbeddedDiagnostic? CheckSyntax(JsonNode node)
    {
        EmbeddedDiagnostic? ownDiagnostic;
        switch (node.Kind)
        {
            case JsonKind.Array:
                ownDiagnostic = CheckArray((JsonArrayNode)node);
                break;
            case JsonKind.Object:
                ownDiagnostic = CheckObject((JsonObjectNode)node);
                break;
            default:
                ownDiagnostic = null;
                break;
        }

        return Earliest(ownDiagnostic, CheckChildren(node));
    }

    // Returns the first problem reported by any child node; child tokens are skipped.
    static EmbeddedDiagnostic? CheckChildren(JsonNode node)
    {
        foreach (var child in node)
        {
            if (child.IsNode && CheckSyntax(child.Node) is { } found)
                return found;
        }

        return null;
    }

    // Arrays may not directly contain properties.
    static EmbeddedDiagnostic? CheckArray(JsonArrayNode node)
    {
        foreach (var element in node.Sequence)
        {
            if (element.Kind != JsonKind.Property)
                continue;

            return new EmbeddedDiagnostic(
                FeaturesResources.Properties_not_allowed_in_an_array,
                ((JsonPropertyNode)element).ColonToken.GetSpan());
        }

        return null;
    }

    // A string literal sitting directly in an object is a property name that is missing its colon.
    static EmbeddedDiagnostic? CheckObject(JsonObjectNode node)
    {
        foreach (var member in node.Sequence)
        {
            if (member is not JsonLiteralNode { LiteralToken.Kind: JsonKind.StringToken })
                continue;

            return new EmbeddedDiagnostic(
                FeaturesResources.Property_name_must_be_followed_by_a_colon,
                member.GetSpan());
        }

        return null;
    }
}
/// <summary>
/// Walks down the first child of each node, starting at <paramref name="nodeOrToken"/>, until a
/// token is reached, and returns that token.
/// </summary>
private static JsonToken GetFirstToken(JsonNodeOrToken nodeOrToken)
{
    JsonNodeOrToken current = nodeOrToken;
    while (current.IsNode)
        current = current.Node.ChildAt(0);

    return current.Token;
}
/// <summary>
/// Returns the first diagnostic found on any child of <paramref name="node"/>, visiting the
/// children in order, or <see langword="null"/> if none carries one.
/// </summary>
private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonNode node)
{
    foreach (var child in node)
    {
        if (GetFirstDiagnostic(child) is { } found)
            return found;
    }

    return null;
}
/// <summary>
/// Dispatches to the node or token overload depending on what <paramref name="child"/> holds.
/// </summary>
private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonNodeOrToken child)
{
    if (child.IsNode)
        return GetFirstDiagnostic(child.Node);

    return GetFirstDiagnostic(child.Token);
}
/// <summary>
/// Returns the first diagnostic attached to <paramref name="token"/>, looking at its leading
/// trivia first, then the token itself, then its trailing trivia.
/// </summary>
private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonToken token)
{
    var fromLeadingTrivia = GetFirstDiagnostic(token.LeadingTrivia);
    if (fromLeadingTrivia is not null)
        return fromLeadingTrivia;

    var fromToken = token.Diagnostics.FirstOrNull();
    if (fromToken is not null)
        return fromToken;

    return GetFirstDiagnostic(token.TrailingTrivia);
}
/// <summary>
/// Returns the first diagnostic attached to any trivia in <paramref name="list"/>, in order, or
/// <see langword="null"/> if no trivia has one.
/// </summary>
private static EmbeddedDiagnostic? GetFirstDiagnostic(ImmutableArray<JsonTrivia> list)
{
    foreach (var trivia in list)
    {
        if (trivia.Diagnostics.FirstOrNull() is { } found)
            return found;
    }

    return null;
}
/// <summary>
/// Parses a run of values. This wrapper only does recursion bookkeeping: it bumps
/// <see cref="_recursionDepth"/>, asks <c>StackGuard</c> to verify there is enough execution stack
/// left, delegates to <see cref="ParseSequenceWorker"/>, and restores the depth on the way out.
/// </summary>
private ImmutableArray<JsonValueNode> ParseSequence()
{
    _recursionDepth++;
    try
    {
        StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
        return ParseSequenceWorker();
    }
    finally
    {
        // Runs on both normal return and when an exception propagates out.
        _recursionDepth--;
    }
}
/// <summary>
/// Keeps parsing values for as long as <see cref="ShouldConsumeSequenceElement"/> says the
/// current token belongs to this sequence, and returns them in order.
/// </summary>
private ImmutableArray<JsonValueNode> ParseSequenceWorker()
{
    using var _ = ArrayBuilder<JsonValueNode>.GetInstance(out var values);

    while (ShouldConsumeSequenceElement())
    {
        var value = ParseValue();
        values.Add(value);
    }

    return values.ToImmutableAndClear();
}
/// <summary>
/// Parses a comma separated run of values. This wrapper only does recursion bookkeeping: it
/// bumps <see cref="_recursionDepth"/>, asks <c>StackGuard</c> to verify there is enough execution
/// stack left, delegates to <see cref="ParseCommaSeparatedSequenceWorker"/>, and restores the
/// depth on the way out.
/// </summary>
private JsonSeparatedList ParseCommaSeparatedSequence()
{
    _recursionDepth++;
    try
    {
        StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
        return ParseCommaSeparatedSequenceWorker();
    }
    finally
    {
        // Runs on both normal return and when an exception propagates out.
        _recursionDepth--;
    }
}
/// <summary>
/// Parses values alternating with comma tokens for as long as
/// <see cref="ShouldConsumeSequenceElement"/> says the current token belongs to this sequence.
/// The resulting list interleaves values and separators; a missing comma is represented by a
/// synthesized (missing) comma token.
/// </summary>
private JsonSeparatedList ParseCommaSeparatedSequenceWorker()
{
    using var _ = ArrayBuilder<JsonNodeOrToken>.GetInstance(out var result);
    var allProperties = true;
    while (ShouldConsumeSequenceElement())
    {
        var value = ParseValue();
        allProperties = allProperties && value.Kind == JsonKind.Property;
        result.Add(value);

        // Try to consume a comma. If we don't see one, consume an empty one as a placeholder. Create a
        // diagnostic message depending on if we've seen only properties before this point. If not, don't give
        // a message about a missing comma. Instead, we'll give a specific message that we didn't get a
        // property when we expected one.
        if (ShouldConsumeSequenceElement())
            result.Add(ConsumeToken(JsonKind.CommaToken, allProperties ? s_commaExpected : null));
    }

    // Use ToImmutableAndClear, matching ParseSequenceWorker. The builder is pooled and released
    // when '_' is disposed, so its contents are not needed after this point.
    return new JsonSeparatedList(result.ToImmutableAndClear());
}
/// <summary>
/// Decides whether the current token should be parsed as another element of the sequence being
/// read. End-of-file always ends the sequence. A closing brace, bracket or paren ends it only
/// when the parser is inside the matching construct (so an enclosing object, array or
/// constructor can consume it); otherwise it is taken as an element. Every other token is an
/// element.
/// </summary>
private readonly bool ShouldConsumeSequenceElement()
{
    switch (_currentToken.Kind)
    {
        case JsonKind.EndOfFile:
            return false;
        case JsonKind.CloseBraceToken:
            return !_inObject;
        case JsonKind.CloseBracketToken:
            return !_inArray;
        case JsonKind.CloseParenToken:
            return !_inConstructor;
        default:
            return true;
    }
}
/// <summary>
/// Parses a single value, choosing the specific parse routine from the kind of the current
/// token. Like the sequence parsers, it tracks <see cref="_recursionDepth"/> and checks the
/// execution stack before descending.
/// </summary>
private JsonValueNode ParseValue()
{
    _recursionDepth++;
    try
    {
        StackGuard.EnsureSufficientExecutionStack(_recursionDepth);

        switch (_currentToken.Kind)
        {
            case JsonKind.OpenBraceToken:
                return ParseObject();
            case JsonKind.OpenBracketToken:
                return ParseArray();
            case JsonKind.CommaToken:
                return ParseCommaValue();
            default:
                return ParseLiteralOrPropertyOrConstructor();
        }
    }
    finally
    {
        // Runs on both normal return and when an exception propagates out.
        _recursionDepth--;
    }
}
/// <summary>
/// Splits a literal token into two tokens: a minus token made from its first character, and a
/// literal token made from the remaining characters.
/// </summary>
/// <param name="literalToken">The token to split.</param>
/// <param name="minusToken">
/// Receives a <see cref="JsonKind.MinusToken"/> over the first character, carrying the original
/// leading trivia and no trailing trivia.
/// </param>
/// <param name="newLiteralToken">
/// Receives a token of the original kind over the remaining characters, carrying no leading
/// trivia, the original trailing trivia, and the original diagnostics.
/// </param>
private static void SplitLiteral(JsonToken literalToken, out JsonToken minusToken, out JsonToken newLiteralToken)
{
    var chars = literalToken.VirtualChars;
    var signChars = chars.GetSubSequence(TextSpan.FromBounds(0, 1));
    var remainingChars = chars.GetSubSequence(TextSpan.FromBounds(1, chars.Length));

    minusToken = CreateToken(JsonKind.MinusToken, literalToken.LeadingTrivia, signChars, []);

    newLiteralToken = CreateToken(
        literalToken.Kind,
        [],
        remainingChars,
        literalToken.TrailingTrivia,
        literalToken.Diagnostics);
}
/// <summary>
/// Parses a property, given its already-consumed name token. Must be called while the current
/// token is the colon that follows the name.
/// </summary>
/// <param name="stringLiteralOrText">
/// The token preceding the colon. Anything other than a string token is re-kinded to
/// <see cref="JsonKind.TextToken"/>.
/// </param>
private JsonPropertyNode ParseProperty(JsonToken stringLiteralOrText)
{
    Debug.Assert(_currentToken.Kind == JsonKind.ColonToken);

    // The name could be anything we might otherwise have parsed as a value (for example, an
    // integer/boolean literal); treat every non-string name as plain text.
    var nameToken = stringLiteralOrText.Kind == JsonKind.StringToken
        ? stringLiteralOrText
        : stringLiteralOrText.With(kind: JsonKind.TextToken);

    var colonToken = ConsumeCurrentToken();

    switch (_currentToken.Kind)
    {
        case JsonKind.CommaToken:
        {
            // Newtonsoft allows "{ a: , }" as a legal property. Synthesize a missing value and leave
            // the real comma to be parsed as the next value in the sequence. The strict pass will
            // error if it sees this missing comma-value as the value of a property.
            var placeholder = new JsonCommaValueNode(CreateMissingToken(JsonKind.CommaToken));
            return new JsonPropertyNode(nameToken, colonToken, placeholder);
        }

        case JsonKind.EndOfFile:
        {
            // Nothing follows the colon at all: synthesize the value and flag it as missing.
            var missingValueDiagnostic = new EmbeddedDiagnostic(
                FeaturesResources.Missing_property_value,
                GetTokenStartPositionSpan(_currentToken));
            var placeholderToken = CreateMissingToken(JsonKind.CommaToken).AddDiagnosticIfNone(missingValueDiagnostic);
            return new JsonPropertyNode(nameToken, colonToken, new JsonCommaValueNode(placeholderToken));
        }
    }

    var value = ParseValue();
    if (value.Kind == JsonKind.Property)
    {
        // It's always illegal to have something like ```"a" : "b" : 1```. Rebuild the nested
        // property with a diagnostic on its colon.
        var nestedProperty = (JsonPropertyNode)value;
        var flaggedColon = nestedProperty.ColonToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
            FeaturesResources.Nested_properties_not_allowed,
            nestedProperty.ColonToken.GetSpan()));

        value = new JsonPropertyNode(nestedProperty.NameToken, flaggedColon, nestedProperty.Value);
    }

    return new JsonPropertyNode(nameToken, colonToken, value);
}
/// <summary>
/// Consumes the current token and then looks at what follows it: a colon means the consumed
/// token is a property name; anything else means it is a literal, text, or constructor.
/// </summary>
private JsonValueNode ParseLiteralOrPropertyOrConstructor()
{
    var textToken = ConsumeCurrentToken();

    if (_currentToken.Kind == JsonKind.ColonToken)
        return ParseProperty(textToken);

    return ParseLiteralOrTextOrConstructor(textToken);
}
/// <summary>
/// Classifies an already-consumed token that is not a property name. In order, it is treated as:
/// a string literal; a <c>new</c> constructor; one of the keyword literals (<c>NaN</c>,
/// <c>null</c>, <c>true</c>, <c>false</c>, <c>Infinity</c>, <c>undefined</c>); <c>-Infinity</c>;
/// a number (when it starts with <c>-</c>, <c>.</c> or a digit); and otherwise as unexpected text
/// carrying a diagnostic.
/// </summary>
private JsonValueNode ParseLiteralOrTextOrConstructor(JsonToken token)
{
    if (token.Kind == JsonKind.StringToken)
        return new JsonLiteralNode(token);

    // Look for constructors (a json.net extension). We'll report them as an error
    // in strict mode.
    if (Matches(token, "new"))
        return ParseConstructor(token);

    // Check for certain literal values. Some of these (like NaN) are json.net only.
    // We'll check for these later in the strict-mode pass.
    Debug.Assert(token.VirtualChars.Length > 0);
    if (GetKeywordLiteralKind(token) is { } keywordKind)
        return new JsonLiteralNode(token.With(kind: keywordKind));

    if (Matches(token, "-Infinity"))
    {
        // Represented as a minus token followed by an Infinity literal.
        SplitLiteral(token, out var minusToken, out var infinityToken);
        return new JsonNegativeLiteralNode(
            minusToken, infinityToken.With(kind: JsonKind.InfinityLiteralToken));
    }

    var firstChar = token.VirtualChars[0];
    var looksNumeric = IsDigit(firstChar) || firstChar == '-' || firstChar == '.';
    if (looksNumeric)
        return new JsonLiteralNode(token.With(kind: JsonKind.NumberToken));

    // Nothing recognizable: keep the text, and complain about its first character.
    var unexpected = new EmbeddedDiagnostic(
        string.Format(FeaturesResources._0_unexpected, firstChar.ToString()),
        firstChar.Span);
    return new JsonTextNode(token.With(kind: JsonKind.TextToken).AddDiagnosticIfNone(unexpected));

    // Maps the exact keyword spellings to their literal kinds; null when nothing matches.
    static JsonKind? GetKeywordLiteralKind(JsonToken candidate)
    {
        if (Matches(candidate, "NaN"))
            return JsonKind.NaNLiteralToken;
        if (Matches(candidate, "null"))
            return JsonKind.NullLiteralToken;
        if (Matches(candidate, "true"))
            return JsonKind.TrueLiteralToken;
        if (Matches(candidate, "false"))
            return JsonKind.FalseLiteralToken;
        if (Matches(candidate, "Infinity"))
            return JsonKind.InfinityLiteralToken;
        if (Matches(candidate, "undefined"))
            return JsonKind.UndefinedLiteralToken;

        return null;
    }
}
/// <summary>
/// Parses a constructor: the already-consumed <c>new</c> token, then a name, an open paren, a
/// sequence of values, and a close paren. Any of the tokens that is not present is synthesized
/// as missing with an appropriate diagnostic.
/// </summary>
/// <param name="token">The already-consumed <c>new</c> token; it is re-kinded to <see cref="JsonKind.NewKeyword"/>.</param>
private JsonConstructorNode ParseConstructor(JsonToken token)
{
    // Mark that we're inside a constructor while parsing its pieces, so a ')' ends the
    // argument sequence; the previous state is put back afterwards.
    var outerInConstructor = _inConstructor;
    _inConstructor = true;

    // Each piece consumes input, so they must be parsed in exactly this order.
    var newKeyword = token.With(kind: JsonKind.NewKeyword);
    var nameToken = ConsumeToken(JsonKind.TextToken, FeaturesResources.Name_expected);
    var openParenToken = ConsumeToken(JsonKind.OpenParenToken, s_openParenExpected);
    var arguments = ParseSequence();
    var closeParenToken = ConsumeToken(JsonKind.CloseParenToken, s_closeParenExpected);

    var constructor = new JsonConstructorNode(newKeyword, nameToken, openParenToken, arguments, closeParenToken);

    _inConstructor = outerInConstructor;
    return constructor;
}
/// <summary>
/// Tests whether the characters of <paramref name="token"/> are exactly <paramref name="val"/>.
/// </summary>
/// <param name="token">The token to test.</param>
/// <param name="val">The exact text to compare against.</param>
/// <param name="kind">The kind to hand back on a match.</param>
/// <param name="newKind">
/// Receives <paramref name="kind"/> on a match; otherwise the default <see cref="JsonKind"/>.
/// </param>
/// <returns><see langword="true"/> on a match.</returns>
private static bool TryMatch(JsonToken token, string val, JsonKind kind, out JsonKind newKind)
{
    var isMatch = Matches(token, val);
    newKind = isMatch ? kind : default;
    return isMatch;
}
/// <summary>
/// Returns <see langword="true"/> when the characters of <paramref name="token"/> are, one for
/// one, the characters of <paramref name="val"/> (same length, same character at each position).
/// </summary>
private static bool Matches(JsonToken token, string val)
{
    var chars = token.VirtualChars;
    if (chars.Length != val.Length)
        return false;

    var index = 0;
    foreach (var expected in val)
    {
        if (chars[index] != expected)
            return false;

        index++;
    }

    return true;
}
/// <summary>
/// Returns <see langword="true"/> when the value of <paramref name="ch"/> lies in the range
/// <c>'0'</c> through <c>'9'</c>, inclusive.
/// </summary>
private static bool IsDigit(VirtualChar ch)
{
    var value = ch.Value;
    return value >= '0' && value <= '9';
}
/// <summary>
/// Consumes the current token and wraps it in a <see cref="JsonCommaValueNode"/>. The caller
/// (<see cref="ParseValue"/>) invokes this when the current token is a comma.
/// </summary>
private JsonCommaValueNode ParseCommaValue()
{
    var commaToken = ConsumeCurrentToken();
    return new JsonCommaValueNode(commaToken);
}
/// <summary>
/// Parses an array: the current token (taken as the opening bracket), a sequence of values, and
/// a closing bracket, which is synthesized as missing with a diagnostic when absent.
/// </summary>
private JsonArrayNode ParseArray()
{
    // Mark that we're inside an array while parsing its contents, so a ']' ends the element
    // sequence; the previous state is put back afterwards.
    var outerInArray = _inArray;
    _inArray = true;

    // Each piece consumes input, so they must be parsed in exactly this order.
    var openBracketToken = ConsumeCurrentToken();
    var elements = ParseSequence();
    var closeBracketToken = ConsumeToken(JsonKind.CloseBracketToken, s_closeBracketExpected);

    var array = new JsonArrayNode(openBracketToken, elements, closeBracketToken);

    _inArray = outerInArray;
    return array;
}
/// <summary>
/// Parses an object: the current token (taken as the opening brace), a comma separated sequence
/// of values, and a closing brace, which is synthesized as missing with a diagnostic when absent.
/// </summary>
private JsonObjectNode ParseObject()
{
    // Mark that we're inside an object while parsing its contents, so a '}' ends the member
    // sequence; the previous state is put back afterwards.
    var outerInObject = _inObject;
    _inObject = true;

    // Each piece consumes input, so they must be parsed in exactly this order.
    var openBraceToken = ConsumeCurrentToken();
    var members = ParseCommaSeparatedSequence();
    var closeBraceToken = ConsumeToken(JsonKind.CloseBraceToken, s_closeBraceExpected);

    var obj = new JsonObjectNode(openBraceToken, members, closeBraceToken);

    _inObject = outerInObject;
    return obj;
}
/// <summary>
/// Consumes the current token if it has the requested kind. Otherwise nothing is consumed and
/// a missing token of that kind is synthesized in its place.
/// </summary>
/// <param name="kind">The kind of token wanted.</param>
/// <param name="error">
/// Message for the diagnostic to attach to a synthesized token, positioned at the start of the
/// current token; pass <see langword="null"/> to synthesize the token without a diagnostic.
/// </param>
private JsonToken ConsumeToken(JsonKind kind, string? error)
{
    if (_currentToken.Kind == kind)
        return ConsumeCurrentToken();

    var missingToken = CreateMissingToken(kind);
    return error is null
        ? missingToken
        : missingToken.AddDiagnosticIfNone(
            new EmbeddedDiagnostic(error, GetTokenStartPositionSpan(_currentToken)));
}
/// <summary>
/// Produces a zero-length span marking where <paramref name="token"/> begins. For the
/// end-of-file token that is the end of the last character of the text; for any other token it
/// is the start of the token's first character.
/// </summary>
private readonly TextSpan GetTokenStartPositionSpan(JsonToken token)
{
    var position = token.Kind == JsonKind.EndOfFile
        ? _lexer.Text.Last().Span.End
        : token.VirtualChars[0].Span.Start;

    return new TextSpan(position, 0);
}
}
|