File: CSharpTokenizer_Tests.cs
Project: ..\..\..\src\Tasks.UnitTests\Microsoft.Build.Tasks.UnitTests.csproj (Microsoft.Build.Tasks.UnitTests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.IO;
 
using Microsoft.Build.Shared.LanguageParser;
using Xunit;
 
#nullable disable
 
namespace Microsoft.Build.UnitTests
{
    public sealed class CSharpTokenizerTests
    {
        // Simple whitespace handling.
        [Fact]
        public void Empty() { AssertTokenize("", "", 0); }
        [Fact]
        public void OneSpace() { AssertTokenize(" ", " \x0d", ".Whitespace"); }
        [Fact]
        public void TwoSpace() { AssertTokenize("  ", "  \x0d", ".Whitespace"); }
        [Fact]
        public void Tab() { AssertTokenize("\t", "\t\x0d", ".Whitespace"); }
        [Fact]
        public void TwoTab() { AssertTokenize("\t\t", "\t\t\x0d", ".Whitespace"); }
        [Fact]
        public void SpaceTab() { AssertTokenize(" \t", " \t\x0d", ".Whitespace"); }
        [Fact]
        public void CrLf() { AssertTokenize("\x0d\x0a", ".Whitespace"); }
        [Fact]
        public void SpaceCrLfSpace() { AssertTokenize(" \x0d\x0a ", " \x0d\x0a \x0d", ".Whitespace"); }
        // From section 2.3.3 of the C# spec, these are also whitespace.
        [Fact]
        public void LineSeparator() { AssertTokenizeUnicode("\x2028", ".Whitespace"); }
        [Fact]
        public void ParagraphSeparator() { AssertTokenizeUnicode("\x2029", ".Whitespace"); }
 
        /*
            Special whitespace handling.
                Horizontal tab character (U+0009)
                Vertical tab character (U+000B)
                Form feed character (U+000C)
        */
        [Fact]
        public void SpecialWhitespace() { AssertTokenize("\x09\x0b\x0c\x0d", ".Whitespace"); }
 
        // One-line comments (i.e. those starting with //)
        [Fact]
        public void OneLineComment() { AssertTokenize("// My one line comment.\x0d", ".Comment.Whitespace"); }
        [Fact]
        public void SpaceOneLineComment() { AssertTokenize(" // My one line comment.\x0d", ".Whitespace.Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentTab() { AssertTokenize(" //\tMy one line comment.\x0d", ".Whitespace.Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentCr() { AssertTokenize("// My one line comment.\x0d", ".Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentLf() { AssertTokenize("// My one line comment.\x0a", ".Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentLineSeparator() { AssertTokenizeUnicode("// My one line comment.\x2028", ".Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentParagraphSeparator() { AssertTokenizeUnicode("// My one line comment.\x2029", ".Comment.Whitespace"); }
        [Fact]
        public void OneLineCommentWithEmbeddedMultiLine() { AssertTokenize("// /*  */\x0d", ".Comment.Whitespace"); }
 
        // Multi-line comments (i.e. those of the form /* */)
        [Fact]
        public void OneLineMultilineComment() { AssertTokenize("/* My comment. */\x0d", ".Comment.Whitespace"); }
        [Fact]
        public void MultilineComment() { AssertTokenize("/* My comment. \x0d\x0a Second Line*/\x0d", ".Comment.Whitespace", 3); }
        [Fact]
        public void MultilineCommentWithEmbeddedSingleLine() { AssertTokenize("/* // */\x0d", ".Comment.Whitespace"); }
        [Fact]
        public void LeftHalfOfUnbalancedMultilineComment() { AssertTokenize("/*\x0d", ".EndOfFileInsideComment"); }
        [Fact]
        public void LeftHalfOfUnbalancedMultilineCommentWithStuff() { AssertTokenize("/* unbalanced\x0d", ".EndOfFileInsideComment"); }
 
        // If the last character of the source file is a Control-Z character (U+001A), this character is deleted.
        [Fact]
        public void NothingPlusControlZatEOF() { AssertTokenize("\x1A", "", "", 0); }
        [Fact]
        public void SomethingPlusControlZatEOF() { AssertTokenize("// My comment\x1A", "// My comment\x0d", ".Comment.Whitespace"); }
 
        // A carriage-return character (U+000D) is added to the end of the source file if that source file is non-empty and if the last character
        // of the source file is not a carriage return (U+000D), a line feed (U+000A), a line separator (U+2028), or a paragraph separator
        // (U+2029).
        [Fact]
        public void NoEOLatEOF() { AssertTokenize("// My comment", "// My comment\x0d", ".Comment.Whitespace"); }
        [Fact]
        public void NoEOLatEOFButFileIsEmpty() { AssertTokenize("", "", "", 0); }
 
        // An identifier that has a "_" embedded somewhere
        [Fact]
        public void IdentifierWithEmbeddedUnderscore() { AssertTokenize("_x_\x0d", ".Identifier.Whitespace"); }
 
        // An identifier with a number
        [Fact]
        public void IdentifierWithNumber() { AssertTokenize("x3\x0d", ".Identifier.Whitespace"); }
 
        // A non-identifier with an '@' and a number
        [Fact]
        public void EscapedIdentifierWithNumber() { AssertTokenize("@3Identifier\x0d", ".ExpectedIdentifier"); }
 
        // A very simple namespace and class.
        [Fact]
        public void NamespacePlusClass()
        {
            AssertTokenize(
            "namespace MyNamespace { class MyClass {} }\x0d",
             ".Keyword.Whitespace.Identifier.Whitespace.OpenScope.Whitespace.Keyword.Whitespace.Identifier.Whitespace.OpenScope.CloseScope.Whitespace.CloseScope.Whitespace");
        }
 
        // If a keyword has '@' in front, then it's treated as an identifier.
        [Fact]
        public void EscapedKeywordMakesIdentifier()
        {
            AssertTokenize(
                "namespace @namespace { class @class {} }\x0d",
                "namespace namespace { class class {} }\x0d",       // Resulting tokens have '@' stripped.
                ".Keyword.Whitespace.Identifier.Whitespace.OpenScope.Whitespace.Keyword.Whitespace.Identifier.Whitespace.OpenScope.CloseScope.Whitespace.CloseScope.Whitespace");
        }
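
        // Illustrative sketch only (commented out, not a verified test): assuming
        // the same '@'-stripping rule applies to any keyword, "@int" would come
        // back as the plain identifier "int".
        // [Fact]
        // public void EscapedIntKeywordMakesIdentifier()
        // {
        //     AssertTokenize("@int\x0d", "int\x0d", ".Identifier.Whitespace");
        // }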
 
        // Check boolean literals
        [Fact]
        public void LiteralTrue() { AssertTokenize("true\x0d", ".BooleanLiteral.Whitespace"); }
        [Fact]
        public void LiteralFalse() { AssertTokenize("false\x0d", ".BooleanLiteral.Whitespace"); }
        [Fact]
        public void LiteralNull() { AssertTokenize("null\x0d", ".NullLiteral.Whitespace"); }
 
        // Check integer literals
        [Fact]
        public void HexIntegerLiteral() { AssertTokenize("0x123F\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexUppercaseXIntegerLiteral() { AssertTokenize("0X1f23\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void IntegerLiteral() { AssertTokenize("123\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void InvalidHexIntegerWithNoneValid() { AssertTokenize("0xG\x0d", ".ExpectedValidHexDigit"); }
 
        // Hex literal long suffix: U u L l UL Ul uL ul LU Lu lU lu
        [Fact]
        public void HexIntegerLiteralUpperU() { AssertTokenize("0x123FU\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralLowerU() { AssertTokenize("0x123Fu\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralUpperL() { AssertTokenize("0x123FL\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralLowerL() { AssertTokenize("0x123Fl\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralUpperUUpperL() { AssertTokenize("0x123FUL\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralUpperULowerL() { AssertTokenize("0x123FUl\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralLowerUUpperL() { AssertTokenize("0x123FuL\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralUpperLUpperU() { AssertTokenize("0x123FLU\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralUpperLLowerU() { AssertTokenize("0x123FLu\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralLowerLUpperU() { AssertTokenize("0x123FlU\x0d", ".HexIntegerLiteral.Whitespace"); }
        [Fact]
        public void HexIntegerLiteralLowerLLowerU() { AssertTokenize("0x123Flu\x0d", ".HexIntegerLiteral.Whitespace"); }
 
        // Decimal literal long suffix: U u L l UL Ul uL ul LU Lu lU lu
        [Fact]
        public void DecimalIntegerLiteralUpperU() { AssertTokenize("1234U\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralLowerU() { AssertTokenize("1234u\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralUpperL() { AssertTokenize("1234L\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralLowerL() { AssertTokenize("1234l\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralUpperUUpperL() { AssertTokenize("1234UL\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralUpperULowerL() { AssertTokenize("1234Ul\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralLowerUUpperL() { AssertTokenize("1234uL\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralUpperLUpperU() { AssertTokenize("1234LU\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralUpperLLowerU() { AssertTokenize("1234Lu\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralLowerLUpperU() { AssertTokenize("1234lU\x0d", ".DecimalIntegerLiteral.Whitespace"); }
        [Fact]
        public void DecimalIntegerLiteralLowerLLowerU() { AssertTokenize("1234lu\x0d", ".DecimalIntegerLiteral.Whitespace"); }
 
        // Reals aren't supported yet.
        // Reals can take many different forms: 1.1, .1, 1.1e6, etc.
        // If you turn this on, please create tests for the other forms too.
        [Fact(Skip = "Ignored in MSTest")]
        public void RealLiteral1() { AssertTokenize("1.1\x0d", ".RealLiteral.Whitespace"); }
 
        // Char literals. (Full escape-sequence support isn't implemented yet; see the skipped tests below.)
        [Fact]
        public void CharLiteral1() { AssertTokenize("'c'\x0d", ".CharLiteral.Whitespace"); }
 
        [Fact(Skip = "Ignored in MSTest")]
        public void CharLiteralIllegalEscapeSequence() { AssertTokenize("'\\z'\x0d", ".SyntaxErrorIllegalEscapeSequence"); }
 
        [Fact(Skip = "Ignored in MSTest")]
        public void CharLiteralHexEscapeSequence() { AssertTokenize("'\\x0022a'\x0d", "'\"a'\x0d", ".CharLiteral.Whitespace"); }
 
        // Check string literals
        [Fact]
        public void LiteralStringBasic() { AssertTokenize("\"string\"\x0d", ".StringLiteral.Whitespace"); }
        [Fact]
        public void LiteralStringAllEscapes() { AssertTokenize("\"\\'\\\"\\\\\\0\\a\\b\\f\\n\\r\\t\\x0\\v\"\x0d", ".StringLiteral.Whitespace"); }
        [Fact]
        public void LiteralStringUnclosed() { AssertTokenize("\"string\x0d", ".NewlineInsideString"); }
        [Fact]
        public void LiteralVerbatimStringBasic() { AssertTokenize("@\"string\"\x0d", "\"string\"\x0d", ".StringLiteral.Whitespace"); }
        [Fact]
        public void LiteralVerbatimStringAllEscapes() { AssertTokenize("@\"\\a\\b\\c\"\x0d", "\"\\a\\b\\c\"\x0d", ".StringLiteral.Whitespace"); }
        [Fact]
        public void LiteralVerbatimStringUnclosed() { AssertTokenize("@\"string\x0d", ".EndOfFileInsideString"); }
        [Fact]
        public void LiteralVerbatimStringQuoteEscapeSequence() { AssertTokenize("@\"\"\"\"\x0d", "\"\"\"\"\x0d", ".StringLiteral.Whitespace"); }
 
        // Single-character operators and punctuators.
        [Fact]
        public void PunctuatorOpenBracket() { AssertTokenize("[\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorCloseBracket() { AssertTokenize("]\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorOpenParen() { AssertTokenize("(\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorCloseParen() { AssertTokenize(")\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorDot() { AssertTokenize(".\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorColon() { AssertTokenize(":\x0d", ".OperatorOrPunctuator.Whitespace"); }
        [Fact]
        public void PunctuatorSemicolon() { AssertTokenize(";\x0d", ".OperatorOrPunctuator.Whitespace"); }
 
        // Preprocessor.
        [Fact]
        public void Preprocessor() { AssertTokenize("#if\x0d", ".OpenConditionalDirective.Whitespace"); }
 
 
        /*
        * Method:  AssertTokenize
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also, the source must be regenerated exactly when the tokens are concatenated
        * back together.
        */
        private static void AssertTokenize(string source, string expectedTokenKey)
        {
            // Most of the time, we expect the rebuilt source to be the same as the input source.
            AssertTokenize(source, source, expectedTokenKey);
        }
 
        /*
        * Method:  AssertTokenizeUnicode
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also, the source must be regenerated exactly when the tokens are concatenated
        * back together.
        */
        private static void AssertTokenizeUnicode(string source, string expectedTokenKey)
        {
            // Most of the time, we expect the rebuilt source to be the same as the input source.
            AssertTokenizeUnicode(source, source, expectedTokenKey);
        }
 
        /*
        * Method:  AssertTokenize
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also, the source must be regenerated exactly when the tokens are concatenated
        * back together.
        */
        private static void AssertTokenize(
           string source,
           string expectedTokenKey,
           int expectedLastLineNumber)
        {
            // Most of the time, we expect the rebuilt source to be the same as the input source.
            AssertTokenize(source, source, expectedTokenKey, expectedLastLineNumber);
        }
 
        /*
        * Method:  AssertTokenizeUnicode
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also, the source must be regenerated exactly when the tokens are concatenated
        * back together.
        */
        private static void AssertTokenizeUnicode(
           string source,
           string expectedTokenKey,
           int expectedLastLineNumber)
        {
            // Most of the time, we expect the rebuilt source to be the same as the input source.
            AssertTokenizeUnicode(source, source, expectedTokenKey, expectedLastLineNumber);
        }
 
        /*
        * Method:  AssertTokenize
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also compare the source that is regenerated by concatenating all of the tokens
        * to 'expectedSource'.
        */
        private static void AssertTokenize(
           string source,
           string expectedSource,
           string expectedTokenKey)
        {
            // A single line is the most common test case.
            AssertTokenize(source, expectedSource, expectedTokenKey, 1);
        }
 
        /*
        * Method:  AssertTokenizeUnicode
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also compare the source that is regenerated by concatenating all of the tokens
        * to 'expectedSource'.
        */
        private static void AssertTokenizeUnicode(
           string source,
           string expectedSource,
           string expectedTokenKey)
        {
            // A single line is the most common test case.
            AssertTokenizeUnicode(source, expectedSource, expectedTokenKey, 1);
        }
 
        /*
        * Method:  AssertTokenizeUnicode
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also compare the source that is regenerated by concatenating all of the tokens
        * to 'expectedSource'.
        */
        private static void AssertTokenizeUnicode(
           string source,
           string expectedSource,
           string expectedTokenKey,
           int expectedLastLineNumber)
        {
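            // Unlike AssertTokenize below, only the Unicode encoding is exercised
            // here, presumably because characters like U+2028 and U+2029 don't
            // survive the ASCII round-trip used there.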
            AssertTokenizeStream(
                StreamHelpers.StringToStream(source, System.Text.Encoding.Unicode),
                expectedSource,
                expectedTokenKey,
                expectedLastLineNumber);
        }
 
        /*
        * Method:  AssertTokenize
        *
        * Tokenize a string ('source') and compare it to the expected set of tokens.
        * Also compare the source that is regenerated by concatenating all of the tokens
        * to 'expectedSource'.
        */
        private static void AssertTokenize(
           string source,
           string expectedSource,
           string expectedTokenKey,
           int expectedLastLineNumber)
        {
            // This version of AssertTokenize exercises several different encodings.
            // The reason is that we want to be sure the tokenizer behaves identically
            // in each of the encoding formats supported by C#.
            AssertTokenizeStream(StreamHelpers.StringToStream(source), expectedSource, expectedTokenKey, expectedLastLineNumber);
            AssertTokenizeStream(StreamHelpers.StringToStream(source, System.Text.Encoding.Unicode), expectedSource, expectedTokenKey, expectedLastLineNumber);
            AssertTokenizeStream(StreamHelpers.StringToStream(source, System.Text.Encoding.UTF8), expectedSource, expectedTokenKey, expectedLastLineNumber);
            AssertTokenizeStream(StreamHelpers.StringToStream(source, System.Text.Encoding.BigEndianUnicode), expectedSource, expectedTokenKey, expectedLastLineNumber);
            AssertTokenizeStream(StreamHelpers.StringToStream(source, System.Text.Encoding.UTF32), expectedSource, expectedTokenKey, expectedLastLineNumber);
            AssertTokenizeStream(StreamHelpers.StringToStream(source, System.Text.Encoding.ASCII), expectedSource, expectedTokenKey, expectedLastLineNumber);
        }
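
        /*
        * Method:  StringToMemoryStream
        *
        * For illustration only: StreamHelpers.StringToStream lives in the shared
        * test utilities. A minimal sketch of the kind of helper it is assumed to
        * be could look like this (hypothetical name; not used by the tests above).
        */
        private static Stream StringToMemoryStream(string source, System.Text.Encoding encoding)
        {
            MemoryStream stream = new MemoryStream();

            // Deliberately not disposed: disposing the writer would close the
            // stream before the tokenizer gets a chance to read it.
            StreamWriter writer = new StreamWriter(stream, encoding);
            writer.Write(source);
            writer.Flush();

            // Rewind so the caller reads from the beginning.
            stream.Position = 0;
            return stream;
        }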
 
        /*
         * Method:  AssertTokenizeStream
         *
         * Tokenize a string ('source') and compare it to the expected set of tokens.
         * Also compare the source that is regenerated by concatenating all of the tokens
         * to 'expectedSource'.
         */
        private static void AssertTokenizeStream(
           Stream source,
           string expectedSource,
           string expectedTokenKey,
           int expectedLastLineNumber)
        {
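            // Construct the tokenizer directly over the raw stream. The second
            // constructor argument is assumed to control forced-ANSI decoding;
            // passing false lets the encoding be detected from the stream itself
            // (which is why the encoding matrix in AssertTokenize above works).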
            CSharpTokenizer tokens = new CSharpTokenizer(
                source,
                false);
            string results = "";
            string tokenKey = "";
            int lastLine = 0;
            bool syntaxError = false;
            foreach (Token t in tokens)
            {
                results += t.InnerText;
                lastLine = t.Line;
 
                if (!syntaxError)
                {
                    // Strip everything up to the last '.' or '+' so we get the token's
                    // class name without the namespace (or nesting type) prepended.
                    string tokenClass = t.ToString();
                    int pos = tokenClass.LastIndexOfAny(new char[] { '+', '.' });
 
                    tokenKey += ".";
                    tokenKey += tokenClass.Substring(pos + 1);
                }
 
                if (t is SyntaxErrorToken)
                {
                    // Stop processing after the first syntax error because
                    // the order of tokens after this is an implementation detail and
                    // shouldn't be encoded into the unit tests.
                    syntaxError = true;
                }
            }
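            // Concrete token classes carry a "Token" suffix (e.g. WhitespaceToken,
            // CommentToken); strip it so the keys read like ".Whitespace.Comment".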
            tokenKey = tokenKey.Replace("Token", "");
            Console.WriteLine(tokenKey);
 
            Assert.Equal(expectedSource, results);
            Assert.Equal(expectedTokenKey, tokenKey);
            Assert.Equal(expectedLastLineNumber, lastLine);
        }
    }
}