// JsonParser.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using Microsoft.CodeAnalysis.EmbeddedLanguages.Common;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;
 
namespace Microsoft.CodeAnalysis.Features.EmbeddedLanguages.Json;
 
using static EmbeddedSyntaxHelpers;
using static JsonHelpers;
using JsonNodeOrToken = EmbeddedSyntaxNodeOrToken<JsonKind, JsonNode>;
using JsonSeparatedList = EmbeddedSeparatedSyntaxNodeList<JsonKind, JsonNode, JsonValueNode>;
using JsonToken = EmbeddedSyntaxToken<JsonKind>;
using JsonTrivia = EmbeddedSyntaxTrivia<JsonKind>;
 
/// <summary>
/// Parser used for reading in a sequence of <see cref="VirtualChar"/>s, and producing a <see
/// cref="JsonTree"/> out of it. Parsing will always succeed (except in the case of a
/// stack-overflow) and will consume the entire sequence of chars.  General roslyn syntax
/// principles are held to (i.e. immutable, fully representative, etc.).
/// <para>
/// For a given input, the parser always parses out the same tree regardless of the flags
/// passed to it.  *However*, depending on those flags, it may return a different set of
/// *diagnostics*.  Specifically, the
/// parser supports json.net parsing and strict RFC8259 (https://tools.ietf.org/html/rfc8259).
/// As such, the parser supports a superset of both, but then does a pass at the end to produce
/// appropriate diagnostics.
/// </para>
/// </summary>
/// <remarks>
/// Note: the json structure we parse out is actually very simple.  It's effectively all lists
/// of <see cref="JsonValueNode"/> values.  We just treat almost everything as a 'value'.  For
/// example, a <see cref="JsonPropertyNode"/> (i.e. <c>"x": 0</c>) is a 'value'.  As such, it
/// can show up in arrays (i.e.  <c>["x": 0, "y": 1]</c>).  This is not legal, but it greatly
/// simplifies parsing.  Effectively, we just have recursive list parsing, where we accept any
/// sort of value in any sort of context.  A later pass will then report errors for the wrong
/// sorts of values showing up in incorrect contexts.
/// <para>
/// Note: We also treat commas (<c>,</c>) as being a 'value' on its own.  This simplifies parsing
/// by allowing us to not have to represent Lists and SeparatedLists.  It also helps model
/// things that are supported in json.net (like <c>[1,,2]</c>).  Our post-parsing pass will
/// then ensure that these comma-values only show up in the right contexts.
/// </para>
/// </remarks>
[NonCopyable]
internal partial struct JsonParser
{
    // Diagnostic messages produced by formatting FeaturesResources._0_expected with each punctuation
    // character the parser may find missing.  They are passed as the 'error' argument to ConsumeToken
    // when the corresponding token is required but not present.
    private static readonly string s_closeBracketExpected = string.Format(FeaturesResources._0_expected, ']');
    private static readonly string s_closeBraceExpected = string.Format(FeaturesResources._0_expected, '}');
    private static readonly string s_openParenExpected = string.Format(FeaturesResources._0_expected, '(');
    private static readonly string s_closeParenExpected = string.Format(FeaturesResources._0_expected, ')');
    private static readonly string s_commaExpected = string.Format(FeaturesResources._0_expected, ',');
 
    // The lexer that tokens are pulled from (see ConsumeCurrentToken).
    private JsonLexer _lexer;
    // The most recently scanned token that has not yet been consumed by the parser.
    private JsonToken _currentToken;
    // Number of ParseSequence / ParseCommaSeparatedSequence / ParseValue calls currently active.  Passed to
    // StackGuard.EnsureSufficientExecutionStack on each of those calls.
    private int _recursionDepth;
 
    // Fields used to keep track of what types of json values we're in.  They're used for error
    // recovery, specifically with respect to encountering unexpected tokens while parsing out a
    // sequence of values.  For example, if we have:  ```{ a: [1, 2, }```, we will mark that
    // we're both in an object and in an array.  When we then encounter the errant ```}```,
    // we'll see that we were in an object, and thus should stop parsing out the sequence for
    // the array so that the ```}``` can be consumed by the object we were in.  However, if we
    // just had ```[1, 2, }```, we would not be in an object, and we would just consume the
    // ```}``` as a bogus value inside the array.
    //
    // This approach of keeping track of the parse contexts we're in, and using them to
    // determine if we should consume or pop-out when encountering an error token, mirrors the
    // same approach that we use in the C# and TS/JS parsers.
    private bool _inObject;
    private bool _inArray;
    private bool _inConstructor;
 
    /// <summary>
    /// Creates a parser over <paramref name="text"/> and primes it by scanning the first token
    /// into <see cref="_currentToken"/>.
    /// </summary>
    private JsonParser(VirtualCharSequence text) : this()
    {
        _lexer = new JsonLexer(text);

        // Prime the parser with the first token.  The value handed back here is only the
        // default-initialized token, so it is discarded.
        _ = ConsumeCurrentToken();
    }
 
    /// <summary>
    /// Advances the parser by one token: the token the parser was looking at is handed back to
    /// the caller, and the lexer is asked to scan the token that follows it.
    /// </summary>
    /// <returns>The token that was current before this call.</returns>
    private JsonToken ConsumeCurrentToken()
    {
        var consumed = _currentToken;

        // Move the lookahead forward so the next caller sees the following token.
        _currentToken = _lexer.ScanNextToken();

        return consumed;
    }
 
    /// <summary>
    /// Parses <paramref name="text"/> into a fully representative syntax tree together with its
    /// list of diagnostics.
    /// </summary>
    /// <param name="text">The characters to parse.</param>
    /// <param name="options">Flags forwarded to the tree-level syntax checks.</param>
    /// <returns>
    /// The parsed tree, or <see langword="null"/> when <paramref name="text"/> is default/empty
    /// or when parsing ran out of execution stack.
    /// </returns>
    public static JsonTree? TryParse(VirtualCharSequence text, JsonOptions options)
    {
        // Nothing to parse: no tree is produced at all.
        if (text.IsDefaultOrEmpty)
            return null;

        try
        {
            return new JsonParser(text).ParseTree(options);
        }
        catch (InsufficientExecutionStackException)
        {
            // Input nested too deeply to parse safely; give up rather than crash.
            return null;
        }
    }
 
    /// <summary>
    /// Parses the entire input as a top level sequence of values, wraps it in a compilation unit,
    /// and computes the single diagnostic (if any) to report for the tree.
    /// </summary>
    /// <param name="options">Selects between the strict checker and the json.net checker.</param>
    private JsonTree ParseTree(JsonOptions options)
    {
        var topLevelValues = this.ParseSequence();
        Debug.Assert(_lexer.Position == _lexer.Text.Length);
        Debug.Assert(_currentToken.Kind == JsonKind.EndOfFile);

        var root = new JsonCompilationUnit(topLevelValues, _currentToken);

        // Diagnostics come from three places:
        //   1. those attached directly to tokens/trivia while parsing (unknown tokens, or tokens that were
        //      needed but not found),
        //   2. grammar rules that hold for both strict and non-strict json,
        //   3. the strict-only or json.net-only checks, depending on the options.
        // All three are computed, but only the one that occurs first in the text is reported so that we do
        // not produce a pile of cascaded errors.
        var parseDiagnostic = GetFirstDiagnostic(root);
        var topLevelDiagnostic = CheckTopLevel(_lexer.Text, root);

        EmbeddedDiagnostic? modeDiagnostic;
        if (options.HasFlag(JsonOptions.Strict))
            modeDiagnostic = StrictSyntaxChecker.CheckRootSyntax(root, options);
        else
            modeDiagnostic = JsonNetSyntaxChecker.CheckSyntax(root);

        var earliest = Earliest(Earliest(parseDiagnostic, topLevelDiagnostic), modeDiagnostic);

        if (earliest == null)
            return new JsonTree(_lexer.Text, root, []);

        return new JsonTree(_lexer.Text, root, [earliest.Value]);
    }
 
    /// <summary>
    /// Picks whichever of the two diagnostics starts first in the text.  If only one is present it
    /// is returned; if neither is present the result is <see langword="null"/>.  When both start
    /// at the same position, <paramref name="d1"/> wins.
    /// </summary>
    private static EmbeddedDiagnostic? Earliest(EmbeddedDiagnostic? d1, EmbeddedDiagnostic? d2)
    {
        if (d1 is not { } first)
            return d2;

        if (d2 is not { } second)
            return d1;

        // Only prefer d2 when it starts strictly before d1, so that ties go to d1.
        return second.Span.Start < first.Span.Start ? d2 : d1;
    }
 
    /// <summary>
    /// Runs the grammar checks that apply to both json.net and strict mode, and returns the
    /// diagnostic for the first problem found (or <see langword="null"/> if there is none).
    /// </summary>
    /// <param name="text">The full text that was parsed; used for the span of the whitespace-only error.</param>
    /// <param name="compilationUnit">The root of the parsed tree.</param>
    private static EmbeddedDiagnostic? CheckTopLevel(
        VirtualCharSequence text, JsonCompilationUnit compilationUnit)
    {
        var sequence = compilationUnit.Sequence;

        if (sequence.IsEmpty)
        {
            // json is not allowed to be just whitespace.
            //
            // Note: there is always at least some content (either real nodes in the tree, or trivia on the EOF
            // token) because parsing only happens for a non-empty sequence of virtual chars.
            if (text.Length == 0)
                return null;

            var onlyWhitespaceTrivia = compilationUnit.EndOfFileToken.LeadingTrivia.All(
                t => t.Kind is JsonKind.WhitespaceTrivia or JsonKind.EndOfLineTrivia);
            if (!onlyWhitespaceTrivia)
                return null;

            return new EmbeddedDiagnostic(FeaturesResources.Syntax_error, GetSpan(text));
        }

        if (sequence.Length >= 2)
        {
            // The top level can't have more than one actual value.  Complain at the start of the second one.
            var offendingToken = GetFirstToken(sequence[1]);
            return new EmbeddedDiagnostic(
                string.Format(FeaturesResources._0_unexpected, offendingToken.VirtualChars[0]),
                offendingToken.GetSpan());
        }

        // Exactly one top level value.
        var onlyValue = sequence.Single();
        switch (onlyValue.Kind)
        {
            case JsonKind.CommaValue:
                {
                    // Commas should never show up in the top level sequence.
                    var commaValue = (JsonCommaValueNode)onlyValue;
                    return new EmbeddedDiagnostic(
                        string.Format(FeaturesResources._0_unexpected, ','),
                        commaValue.CommaToken.GetSpan());
                }

            case JsonKind.Property:
                {
                    // Nor should a bare property; the colon is what gets flagged.
                    var property = (JsonPropertyNode)onlyValue;
                    return new EmbeddedDiagnostic(
                        string.Format(FeaturesResources._0_unexpected, ':'),
                        property.ColonToken.GetSpan());
                }

            default:
                return CheckSyntax(onlyValue);
        }

        // Checks 'node' itself (when it is an array or object) and then everything beneath it, returning
        // whichever diagnostic starts first.
        static EmbeddedDiagnostic? CheckSyntax(JsonNode node)
        {
            EmbeddedDiagnostic? ownDiagnostic = null;
            if (node.Kind == JsonKind.Array)
                ownDiagnostic = CheckArray((JsonArrayNode)node);
            else if (node.Kind == JsonKind.Object)
                ownDiagnostic = CheckObject((JsonObjectNode)node);

            return Earliest(ownDiagnostic, CheckChildren(node));
        }

        // Walks the child nodes of 'node' in order and returns the first diagnostic any of them produce.
        static EmbeddedDiagnostic? CheckChildren(JsonNode node)
        {
            foreach (var child in node)
            {
                // Tokens carry no structure to validate here.
                if (!child.IsNode)
                    continue;

                if (CheckSyntax(child.Node) is { } diagnostic)
                    return diagnostic;
            }

            return null;
        }

        // Properties are not allowed as direct elements of an array.
        static EmbeddedDiagnostic? CheckArray(JsonArrayNode node)
        {
            foreach (var child in node.Sequence)
            {
                if (child.Kind != JsonKind.Property)
                    continue;

                return new EmbeddedDiagnostic(
                    FeaturesResources.Properties_not_allowed_in_an_array,
                    ((JsonPropertyNode)child).ColonToken.GetSpan());
            }

            return null;
        }

        // A bare string literal directly inside an object is a property name that is missing its colon.
        static EmbeddedDiagnostic? CheckObject(JsonObjectNode node)
        {
            foreach (var child in node.Sequence)
            {
                var isBareString = child is JsonLiteralNode { LiteralToken.Kind: JsonKind.StringToken };
                if (!isBareString)
                    continue;

                return new EmbeddedDiagnostic(
                    FeaturesResources.Property_name_must_be_followed_by_a_colon,
                    child.GetSpan());
            }

            return null;
        }
    }
 
    /// <summary>
    /// Returns the first token of <paramref name="nodeOrToken"/>: the token itself if it is one,
    /// otherwise the token reached by repeatedly descending into the first child.
    /// </summary>
    private static JsonToken GetFirstToken(JsonNodeOrToken nodeOrToken)
    {
        var current = nodeOrToken;

        // Keep stepping into child 0 until we land on a token.
        while (current.IsNode)
            current = current.Node.ChildAt(0);

        return current.Token;
    }
 
    /// <summary>
    /// Searches the children of <paramref name="node"/>, in order, for the first diagnostic
    /// attached to any token or trivia beneath it.
    /// </summary>
    private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonNode node)
    {
        foreach (var child in node)
        {
            if (GetFirstDiagnostic(child) is { } found)
                return found;
        }

        return null;
    }
 
    /// <summary>
    /// Dispatches to the node or token overload depending on what <paramref name="child"/> holds.
    /// </summary>
    private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonNodeOrToken child)
    {
        if (child.IsNode)
            return GetFirstDiagnostic(child.Node);

        return GetFirstDiagnostic(child.Token);
    }
 
    /// <summary>
    /// Returns the first diagnostic associated with <paramref name="token"/>, looking at its
    /// leading trivia, then the token's own diagnostics, then its trailing trivia.
    /// </summary>
    private static EmbeddedDiagnostic? GetFirstDiagnostic(JsonToken token)
    {
        if (GetFirstDiagnostic(token.LeadingTrivia) is { } fromLeadingTrivia)
            return fromLeadingTrivia;

        if (token.Diagnostics.FirstOrNull() is { } fromToken)
            return fromToken;

        return GetFirstDiagnostic(token.TrailingTrivia);
    }
 
    /// <summary>
    /// Returns the first diagnostic found on any trivia in <paramref name="list"/>, scanning in
    /// order, or <see langword="null"/> if none of them carry one.
    /// </summary>
    private static EmbeddedDiagnostic? GetFirstDiagnostic(ImmutableArray<JsonTrivia> list)
    {
        foreach (var trivia in list)
        {
            if (trivia.Diagnostics.FirstOrNull() is { } found)
                return found;
        }

        return null;
    }
 
    /// <summary>
    /// Parses a run of values with no separator handling (commas are themselves parsed as
    /// values).  Tracks the recursion depth and checks for sufficient execution stack before
    /// delegating to <see cref="ParseSequenceWorker"/>.
    /// </summary>
    private ImmutableArray<JsonValueNode> ParseSequence()
    {
        _recursionDepth++;
        try
        {
            StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
            return ParseSequenceWorker();
        }
        finally
        {
            // Always undo the depth bump, including when the stack check throws.
            _recursionDepth--;
        }
    }
 
    /// <summary>
    /// Keeps parsing values for as long as <see cref="ShouldConsumeSequenceElement"/> says the
    /// current token belongs to this sequence, and returns them in order.
    /// </summary>
    private ImmutableArray<JsonValueNode> ParseSequenceWorker()
    {
        using var _ = ArrayBuilder<JsonValueNode>.GetInstance(out var values);

        while (true)
        {
            // Stop at EOF, or at a closing token that belongs to an enclosing construct.
            if (!ShouldConsumeSequenceElement())
                return values.ToImmutableAndClear();

            values.Add(ParseValue());
        }
    }
 
    /// <summary>
    /// Parses a comma separated run of values.  Tracks the recursion depth and checks for
    /// sufficient execution stack before delegating to
    /// <see cref="ParseCommaSeparatedSequenceWorker"/>.
    /// </summary>
    private JsonSeparatedList ParseCommaSeparatedSequence()
    {
        _recursionDepth++;
        try
        {
            StackGuard.EnsureSufficientExecutionStack(_recursionDepth);
            return ParseCommaSeparatedSequenceWorker();
        }
        finally
        {
            // Always undo the depth bump, including when the stack check throws.
            _recursionDepth--;
        }
    }
 
    /// <summary>
    /// Parses values separated by commas (used for the contents of an object) into a separated
    /// list that alternates values and comma tokens.  When a comma is absent between two values,
    /// a missing comma token is inserted in its place so the list shape stays regular.
    /// </summary>
    /// <returns>The separated list of values and (possibly missing) comma tokens.</returns>
    private JsonSeparatedList ParseCommaSeparatedSequenceWorker()
    {
        using var _ = ArrayBuilder<JsonNodeOrToken>.GetInstance(out var result);
        var allProperties = true;
        while (ShouldConsumeSequenceElement())
        {
            var value = ParseValue();
            allProperties = allProperties && value.Kind == JsonKind.Property;
            result.Add(value);

            // Try to consume a comma.  If we don't see one, consume an empty one as a placeholder. Create a
            // diagnostic message depending on if we've seen only properties before this point.  If not, don't give
            // a message about a missing comma.  Instead, we'll give a specific message that we didn't get a
            // property when we expected one.
            if (ShouldConsumeSequenceElement())
                result.Add(ConsumeToken(JsonKind.CommaToken, allProperties ? s_commaExpected : null));
        }

        // Matches ParseSequenceWorker: hand the contents over and clear the pooled builder in one step,
        // rather than copying out of a builder that is about to be released anyway.
        return new JsonSeparatedList(result.ToImmutableAndClear());
    }
 
    /// <summary>
    /// Decides whether the current token should be parsed as another element of the sequence
    /// being built.  End-of-file never is.  A closing brace/bracket/paren is only consumed as a
    /// (bogus) element when we are not inside the kind of construct it would close; otherwise the
    /// sequence stops so the enclosing construct can consume it.
    /// </summary>
    private readonly bool ShouldConsumeSequenceElement()
    {
        switch (_currentToken.Kind)
        {
            case JsonKind.EndOfFile:
                return false;
            case JsonKind.CloseBraceToken:
                return !_inObject;
            case JsonKind.CloseBracketToken:
                return !_inArray;
            case JsonKind.CloseParenToken:
                return !_inConstructor;
            default:
                return true;
        }
    }
 
    /// <summary>
    /// Parses a single value, choosing the specific parse routine from the kind of the current
    /// token.  Tracks the recursion depth and checks for sufficient execution stack first.
    /// </summary>
    private JsonValueNode ParseValue()
    {
        _recursionDepth++;
        try
        {
            StackGuard.EnsureSufficientExecutionStack(_recursionDepth);

            switch (_currentToken.Kind)
            {
                case JsonKind.OpenBraceToken:
                    return ParseObject();
                case JsonKind.OpenBracketToken:
                    return ParseArray();
                case JsonKind.CommaToken:
                    return ParseCommaValue();
                default:
                    // Anything else is a literal, free text, a property name, or a constructor.
                    return ParseLiteralOrPropertyOrConstructor();
            }
        }
        finally
        {
            // Always undo the depth bump, including when the stack check throws.
            _recursionDepth--;
        }
    }
 
    /// <summary>
    /// Splits a literal token into two tokens: one for its first character (given kind
    /// <see cref="JsonKind.MinusToken"/>) and one for the remaining characters (keeping the
    /// original kind).
    /// </summary>
    /// <param name="literalToken">The token to split.</param>
    /// <param name="minusToken">Receives the first character along with the original leading trivia.</param>
    /// <param name="newLiteralToken">
    /// Receives the remaining characters along with the original trailing trivia and diagnostics.
    /// </param>
    private static void SplitLiteral(JsonToken literalToken, out JsonToken minusToken, out JsonToken newLiteralToken)
    {
        var allChars = literalToken.VirtualChars;
        var signChars = allChars.GetSubSequence(new TextSpan(0, 1));
        var remainingChars = allChars.GetSubSequence(TextSpan.FromBounds(1, allChars.Length));

        // Leading trivia stays with the first piece, trailing trivia and diagnostics with the second; the
        // seam between the two pieces gets no trivia.
        minusToken = CreateToken(JsonKind.MinusToken, literalToken.LeadingTrivia, signChars, []);
        newLiteralToken = CreateToken(
            literalToken.Kind, [], remainingChars, literalToken.TrailingTrivia, literalToken.Diagnostics);
    }
 
    /// <summary>
    /// Parses a property whose name token has already been consumed and whose colon is the
    /// current token.
    /// </summary>
    /// <param name="stringLiteralOrText">
    /// The already-consumed name token.  If it is not a string token it is re-kinded as a text
    /// token.
    /// </param>
    private JsonPropertyNode ParseProperty(JsonToken stringLiteralOrText)
    {
        Debug.Assert(_currentToken.Kind == JsonKind.ColonToken);

        // The name could be anything else we might have parsed as a value (for example, an integer/boolean
        // literal).  Anything that is not a string is treated as plain text.
        var nameToken = stringLiteralOrText.Kind == JsonKind.StringToken
            ? stringLiteralOrText
            : stringLiteralOrText.With(kind: JsonKind.TextToken);

        var colonToken = ConsumeCurrentToken();

        switch (_currentToken.Kind)
        {
            case JsonKind.CommaToken:
                {
                    // Newtonsoft allows "{ a: , }" as a legal property.  Synthesize a missing value and leave
                    // the real comma to be parsed as the next value in the sequence.  The strict pass will
                    // error if it sees this missing comma-value as the value of a property.
                    var placeholder = new JsonCommaValueNode(CreateMissingToken(JsonKind.CommaToken));
                    return new JsonPropertyNode(nameToken, colonToken, placeholder);
                }

            case JsonKind.EndOfFile:
                {
                    // Ran out of input right after the colon: synthesize a missing value that carries the error.
                    var missingValue = new EmbeddedDiagnostic(
                        FeaturesResources.Missing_property_value,
                        GetTokenStartPositionSpan(_currentToken));
                    var placeholder = new JsonCommaValueNode(
                        CreateMissingToken(JsonKind.CommaToken).AddDiagnosticIfNone(missingValue));
                    return new JsonPropertyNode(nameToken, colonToken, placeholder);
                }
        }

        var value = ParseValue();
        if (value.Kind == JsonKind.Property)
        {
            // It's always illegal to have something like  ```"a" : "b" : 1```.  Rebuild the inner property
            // with the error attached to its colon.
            var inner = (JsonPropertyNode)value;
            var flaggedColon = inner.ColonToken.AddDiagnosticIfNone(new EmbeddedDiagnostic(
                FeaturesResources.Nested_properties_not_allowed,
                inner.ColonToken.GetSpan()));

            value = new JsonPropertyNode(inner.NameToken, flaggedColon, inner.Value);
        }

        return new JsonPropertyNode(nameToken, colonToken, value);
    }
 
    /// <summary>
    /// Consumes the current token and then looks at what follows it: a colon means the consumed
    /// token is a property name; otherwise it is a literal, text, or the start of a constructor.
    /// </summary>
    private JsonValueNode ParseLiteralOrPropertyOrConstructor()
    {
        var firstToken = ConsumeCurrentToken();

        if (_currentToken.Kind == JsonKind.ColonToken)
            return ParseProperty(firstToken);

        return ParseLiteralOrTextOrConstructor(firstToken);
    }
 
    /// <summary>
    /// Classifies an already-consumed token that is not a property name: a string literal, a
    /// <c>new</c> constructor, one of the keyword literals, <c>-Infinity</c>, a number, or
    /// otherwise unexpected text (which gets a diagnostic).
    /// </summary>
    private JsonValueNode ParseLiteralOrTextOrConstructor(JsonToken token)
    {
        if (token.Kind == JsonKind.StringToken)
            return new JsonLiteralNode(token);

        // Constructors are a json.net extension.  They are reported as an error in strict mode.
        if (Matches(token, "new"))
            return ParseConstructor(token);

        // Keyword-like literals.  Some of these (like NaN) are json.net only; those are checked for later
        // in the strict-mode pass.
        Debug.Assert(token.VirtualChars.Length > 0);
        if (TryGetKeywordLiteralKind(token, out var literalKind))
            return new JsonLiteralNode(token.With(kind: literalKind));

        if (Matches(token, "-Infinity"))
        {
            // Represented as a minus token followed by an Infinity literal.
            SplitLiteral(token, out var minusToken, out var infinityToken);

            return new JsonNegativeLiteralNode(
                minusToken, infinityToken.With(kind: JsonKind.InfinityLiteralToken));
        }

        // Anything that starts like a number is treated as one here.
        var firstChar = token.VirtualChars[0];
        if (firstChar == '-' || firstChar == '.' || IsDigit(firstChar))
            return new JsonLiteralNode(token.With(kind: JsonKind.NumberToken));

        // Nothing recognizable: keep it in the tree as text, flagged at its first character.
        var unexpected = new EmbeddedDiagnostic(
            string.Format(FeaturesResources._0_unexpected, firstChar.ToString()),
            firstChar.Span);

        return new JsonTextNode(token.With(kind: JsonKind.TextToken).AddDiagnosticIfNone(unexpected));

        // Maps the exact text of 'candidate' to the matching keyword literal kind, if there is one.
        static bool TryGetKeywordLiteralKind(JsonToken candidate, out JsonKind kind)
            => TryMatch(candidate, "NaN", JsonKind.NaNLiteralToken, out kind) ||
               TryMatch(candidate, "null", JsonKind.NullLiteralToken, out kind) ||
               TryMatch(candidate, "true", JsonKind.TrueLiteralToken, out kind) ||
               TryMatch(candidate, "false", JsonKind.FalseLiteralToken, out kind) ||
               TryMatch(candidate, "Infinity", JsonKind.InfinityLiteralToken, out kind) ||
               TryMatch(candidate, "undefined", JsonKind.UndefinedLiteralToken, out kind);
    }
 
    /// <summary>
    /// Parses a constructor: the already-consumed <c>new</c> token, a name, an open paren, a
    /// sequence of values, and a close paren.  Missing pieces are filled in with missing tokens
    /// carrying the appropriate diagnostic.
    /// </summary>
    /// <param name="token">The already-consumed token whose text is <c>new</c>.</param>
    private JsonConstructorNode ParseConstructor(JsonToken token)
    {
        // Mark that we're inside a constructor so a ')' in the argument sequence ends that sequence.
        var outerInConstructor = _inConstructor;
        _inConstructor = true;

        var newKeyword = token.With(kind: JsonKind.NewKeyword);
        var nameToken = ConsumeToken(JsonKind.TextToken, FeaturesResources.Name_expected);
        var openParenToken = ConsumeToken(JsonKind.OpenParenToken, s_openParenExpected);
        var arguments = ParseSequence();
        var closeParenToken = ConsumeToken(JsonKind.CloseParenToken, s_closeParenExpected);

        _inConstructor = outerInConstructor;

        return new JsonConstructorNode(newKeyword, nameToken, openParenToken, arguments, closeParenToken);
    }
 
    /// <summary>
    /// If the text of <paramref name="token"/> is exactly <paramref name="val"/>, sets
    /// <paramref name="newKind"/> to <paramref name="kind"/> and returns <see langword="true"/>.
    /// Otherwise sets <paramref name="newKind"/> to its default value and returns
    /// <see langword="false"/>.
    /// </summary>
    private static bool TryMatch(JsonToken token, string val, JsonKind kind, out JsonKind newKind)
    {
        var isMatch = Matches(token, val);
        newKind = isMatch ? kind : default;
        return isMatch;
    }
 
    /// <summary>
    /// Returns whether the characters of <paramref name="token"/> are, one for one, the
    /// characters of <paramref name="val"/>.
    /// </summary>
    private static bool Matches(JsonToken token, string val)
    {
        var tokenChars = token.VirtualChars;
        var length = val.Length;

        // Different lengths can never match; this also keeps the indexing below in range.
        if (tokenChars.Length != length)
            return false;

        var index = 0;
        while (index < length)
        {
            if (tokenChars[index] != val[index])
                return false;

            index++;
        }

        return true;
    }
 
    /// <summary>
    /// Returns whether <paramref name="ch"/> is one of the characters <c>0</c> through <c>9</c>.
    /// </summary>
    private static bool IsDigit(VirtualChar ch)
    {
        return ch.Value >= '0' && ch.Value <= '9';
    }
 
    /// <summary>
    /// Consumes the current token and wraps it in a comma value node.  Called from
    /// <see cref="ParseValue"/> when the current token is a comma.
    /// </summary>
    private JsonCommaValueNode ParseCommaValue()
    {
        var commaToken = ConsumeCurrentToken();
        return new JsonCommaValueNode(commaToken);
    }
 
    /// <summary>
    /// Parses an array: consumes the current (opening) token, parses the element sequence, and
    /// then expects a closing bracket, substituting a missing token with a diagnostic if it is
    /// absent.
    /// </summary>
    private JsonArrayNode ParseArray()
    {
        // Mark that we're inside an array so a ']' in the element sequence ends that sequence.
        var outerInArray = _inArray;
        _inArray = true;

        var openBracketToken = ConsumeCurrentToken();
        var elements = ParseSequence();
        var closeBracketToken = ConsumeToken(JsonKind.CloseBracketToken, s_closeBracketExpected);

        _inArray = outerInArray;

        return new JsonArrayNode(openBracketToken, elements, closeBracketToken);
    }
 
    /// <summary>
    /// Parses an object: consumes the current (opening) token, parses the comma separated member
    /// sequence, and then expects a closing brace, substituting a missing token with a diagnostic
    /// if it is absent.
    /// </summary>
    private JsonObjectNode ParseObject()
    {
        // Mark that we're inside an object so a '}' in the member sequence ends that sequence.
        var outerInObject = _inObject;
        _inObject = true;

        var openBraceToken = ConsumeCurrentToken();
        var members = ParseCommaSeparatedSequence();
        var closeBraceToken = ConsumeToken(JsonKind.CloseBraceToken, s_closeBraceExpected);

        _inObject = outerInObject;

        return new JsonObjectNode(openBraceToken, members, closeBraceToken);
    }
 
    /// <summary>
    /// Consumes the current token if it has the requested <paramref name="kind"/>.  Otherwise
    /// leaves the current token in place and returns a missing token of that kind.
    /// </summary>
    /// <param name="kind">The kind of token expected at this point.</param>
    /// <param name="error">
    /// Message to attach to the missing token, positioned at the start of the current token, or
    /// <see langword="null"/> to create the missing token without a diagnostic.
    /// </param>
    private JsonToken ConsumeToken(JsonKind kind, string? error)
    {
        if (_currentToken.Kind == kind)
            return ConsumeCurrentToken();

        var missing = CreateMissingToken(kind);

        return error is null
            ? missing
            : missing.AddDiagnosticIfNone(new EmbeddedDiagnostic(error, GetTokenStartPositionSpan(_currentToken)));
    }
 
    /// <summary>
    /// Returns a zero-length span at the position where <paramref name="token"/> begins.  For the
    /// end-of-file token, the position used is the end of the last character of the text.
    /// </summary>
    private readonly TextSpan GetTokenStartPositionSpan(JsonToken token)
    {
        var position = token.Kind == JsonKind.EndOfFile
            ? _lexer.Text.Last().Span.End
            : token.VirtualChars[0].Span.Start;

        return new TextSpan(position, 0);
    }
}
// File: EmbeddedLanguages\Json\JsonParser.cs
// Project: src\src\Features\Core\Portable\Microsoft.CodeAnalysis.Features.csproj (Microsoft.CodeAnalysis.Features)