|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System;
using System.Collections.Generic;
using System.IO;
#nullable disable
namespace Microsoft.Build.Shared.LanguageParser
{
/*
* Class: VisualBasicTokenEnumerator
*
* Given vb sources, enumerate over all tokens.
*
*/
internal sealed class VisualBasicTokenEnumerator : TokenEnumerator
{
    // Reader over the sources. Assigned exactly once, in the constructor.
    private readonly VisualBasicTokenCharReader _reader;

    // Upper-cased reserved words. Identifiers are upper-cased with
    // ToUpperInvariant before being looked up here, so the set uses an
    // ordinal comparer. "TRUE" and "FALSE" are listed, but FindNextToken
    // classifies them as boolean literals before it consults this set.
    private static readonly HashSet<string> s_keywords = new HashSet<string>(StringComparer.Ordinal)
    {
        "ADDHANDLER", "ADDRESSOF", "ANDALSO", "ALIAS",
        "AND", "ANSI", "AS", "ASSEMBLY",
        "AUTO", "BOOLEAN", "BYREF", "BYTE",
        "BYVAL", "CALL", "CASE", "CATCH",
        "CBOOL", "CBYTE", "CCHAR", "CDATE",
        "CDEC", "CDBL", "CHAR", "CINT",
        "CLASS", "CLNG", "COBJ", "CONST", "CONTINUE", "CSBYTE",
        "CSHORT", "CSNG", "CSTR", "CTYPE", "CUINT", "CULNG", "CUSHORT",
        "DATE", "DECIMAL", "DECLARE", "DEFAULT",
        "DELEGATE", "DIM", "DIRECTCAST", "DO",
        "DOUBLE", "EACH", "ELSE", "ELSEIF",
        "END", "ENDIF", "ENUM", "ERASE", "ERROR",
        "EVENT", "EXIT", "FALSE", "FINALLY",
        "FOR", "FRIEND", "FUNCTION", "GET",
        "GETTYPE", "GLOBAL", "GOSUB", "GOTO", "HANDLES",
        "IF", "IMPLEMENTS", "IMPORTS", "IN",
        "INHERITS", "INTEGER", "INTERFACE", "IS", "ISNOT",
        "LET", "LIB", "LIKE", "LONG",
        "LOOP", "ME", "MOD", "MODULE",
        "MUSTINHERIT", "MUSTOVERRIDE", "MYBASE", "MYCLASS",
        "NAMESPACE", "NARROWING", "NEW", "NEXT", "NOT",
        "NOTHING", "NOTINHERITABLE", "NOTOVERRIDABLE", "OBJECT",
        "OF", "ON", "OPERATOR", "OPTION", "OPTIONAL", "OR",
        "ORELSE", "OVERLOADS", "OVERRIDABLE", "OVERRIDES",
        "PARAMARRAY", "PARTIAL", "PRESERVE", "PRIVATE", "PROPERTY",
        "PROTECTED", "PUBLIC", "RAISEEVENT", "READONLY",
        "REDIM", "REM", "REMOVEHANDLER", "RESUME",
        "RETURN", "SBYTE", "SELECT", "SET", "SHADOWS",
        "SHARED", "SHORT", "SINGLE", "STATIC",
        "STEP", "STOP", "STRING", "STRUCTURE",
        "SUB", "SYNCLOCK", "THEN", "THROW",
        "TO", "TRUE", "TRY", "TRYCAST", "TYPEOF",
        "UNICODE", "UINTEGER", "ULONG", "UNTIL", "USHORT", "USING", "VARIANT", "WEND", "WHEN",
        "WHILE", "WIDENING", "WITH", "WITHEVENTS", "WRITEONLY",
        "XOR",
    };

    /*
    * Method: VisualBasicTokenEnumerator
    *
    * Construct an enumerator over the given stream. Both arguments are
    * handed unchanged to the VisualBasicTokenCharReader constructor.
    */
    internal VisualBasicTokenEnumerator(Stream binaryStream, bool forceANSI)
    {
        _reader = new VisualBasicTokenCharReader(binaryStream, forceANSI);
    }

    /*
    * Method: FindNextToken
    *
    * Recognize the token that starts at the reader's current position,
    * advance the reader past it, and store the resulting token object in
    * 'current'.
    *
    * Every path through this implementation returns 'true': input that
    * matches no rule is consumed one character at a time and reported as an
    * UnrecognizedToken. Malformed constructs are reported through dedicated
    * "Expected..." / "EndOfFileInside..." token types rather than exceptions.
    *
    * The order of the checks below is significant because most reader
    * "Sink" calls consume input when they succeed.
    */
    internal override bool FindNextToken()
    {
        // Remember where this token begins so the matched text can be
        // recovered later for identifier/keyword classification.
        int startPosition = _reader.Position;

        // VB docs claim whitespace is Unicode category Zs. However,
        // this category does not contain tabs. Assuming a less restrictive
        // definition for whitespace...
        if (_reader.SinkWhiteSpace())
        {
            // Swallow the rest of the whitespace run.
            while (_reader.SinkWhiteSpace())
            {
            }

            // Now, we need to check for the line continuation character.
            if (_reader.SinkLineContinuationCharacter()) // Line continuation is '_'
            {
                // Save the position of the '_' itself (it has just been sunk,
                // hence the -1) because we may need to come back here.
                int savePosition = _reader.Position - 1;

                // Skip all whitespace after the '_'
                while (_reader.SinkWhiteSpace())
                {
                }

                // Now, skip all the newlines.
                // Need at least one newline for this to count as line continuation.
                int count = 0;
                while (_reader.SinkNewLine())
                {
                    ++count;
                }

                if (count > 0)
                {
                    current = new VisualBasicTokenizer.LineContinuationToken();
                    return true;
                }

                // Otherwise, fall back to plain old whitespace. Rewinding leaves
                // the '_' unconsumed so that the next call tokenizes it.
                _reader.Position = savePosition;
            }

            current = new WhitespaceToken();
            return true;
        }
        // Line terminators are separate from whitespace and are significant.
        else if (_reader.SinkNewLine())
        {
            // We want one token per line terminator.
            current = new VisualBasicTokenizer.LineTerminatorToken();
            return true;
        }
        // Check for a comment--either those that start with ' or rem.
        else if (_reader.SinkLineCommentStart())
        {
            // Skip to the first EOL.
            _reader.SinkToEndOfLine();

            current = new CommentToken();
            return true;
        }
        // Identifier or keyword?
        else if
        (
            // VB allows escaping of identifiers by surrounding them with []
            // In other words,
            //      Date is a keyword but,
            //      [Date] is an identifier.
            _reader.CurrentCharacter == '[' ||
            _reader.MatchNextIdentifierStart())
        {
            bool escapedIdentifier = false;
            if (_reader.CurrentCharacter == '[')
            {
                escapedIdentifier = true;
                _reader.SinkCharacter();

                // Now, the next character must be an identifier start.
                if (!_reader.SinkIdentifierStart())
                {
                    current = new ExpectedIdentifierToken();
                    return true;
                }
            }

            // Sink the rest of the identifier.
            while (_reader.SinkIdentifierPart())
            {
            }

            // If this was an escaped identifier then we need to get the terminating ']'.
            if (escapedIdentifier)
            {
                if (!_reader.Sink("]"))
                {
                    current = new ExpectedIdentifierToken();
                    return true;
                }
            }
            else
            {
                // Escaped identifiers are not allowed to have trailing type character.
                _reader.SinkTypeCharacter(); // Type character is optional.
            }

            // An identifier that is only a '_' is illegal because it is
            // ambiguous with line continuation
            string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);
            if (identifierOrKeyword == "_" || identifierOrKeyword == "[_]" || identifierOrKeyword == "[]")
            {
                current = new ExpectedIdentifierToken();
                return true;
            }

            // Make an upper-case version in order to check whether this may be a keyword.
            // An escaped identifier still carries its brackets at this point, so it
            // can never match a literal or keyword below.
            string upper = identifierOrKeyword.ToUpperInvariant();

            // Boolean literals take precedence over the keyword table.
            if (upper == "FALSE" || upper == "TRUE")
            {
                current = new BooleanLiteralToken();
                return true;
            }

            if (s_keywords.Contains(upper))
            {
                current = new KeywordToken();
                return true;
            }

            // Create the token.
            current = new IdentifierToken();

            // Trim off the [] if this is an escaped identifier.
            if (escapedIdentifier)
            {
                current.InnerText = identifierOrKeyword.Substring(1, identifierOrKeyword.Length - 2);
            }

            return true;
        }
        // Is it a hex integer?
        else if (_reader.SinkHexIntegerPrefix())
        {
            // A prefix with no digits after it is an error token.
            if (!_reader.SinkMultipleHexDigits())
            {
                current = new ExpectedValidHexDigitToken();
                return true;
            }

            // Sink a suffix if there is one.
            _reader.SinkIntegerSuffix();

            current = new HexIntegerLiteralToken();
            return true;
        }
        // Is it an octal integer?
        else if (_reader.SinkOctalIntegerPrefix())
        {
            // A prefix with no digits after it is an error token.
            if (!_reader.SinkMultipleOctalDigits())
            {
                current = new VisualBasicTokenizer.ExpectedValidOctalDigitToken();
                return true;
            }

            // Sink a suffix if there is one.
            _reader.SinkIntegerSuffix();

            current = new VisualBasicTokenizer.OctalIntegerLiteralToken();
            return true;
        }
        // Is it a decimal integer?
        else if (_reader.SinkMultipleDecimalDigits())
        {
            // Sink a suffix if there is one.
            _reader.SinkDecimalIntegerSuffix();

            current = new DecimalIntegerLiteralToken();
            return true;
        }
        // Preprocessor line
        else if (_reader.CurrentCharacter == '#')
        {
            // Only "#if" and "#end if" get their own token types; every other
            // line starting with '#' is a generic preprocessor token.
            if (_reader.SinkIgnoreCase("#if"))
            {
                current = new OpenConditionalDirectiveToken();
            }
            else if (_reader.SinkIgnoreCase("#end if"))
            {
                current = new CloseConditionalDirectiveToken();
            }
            else
            {
                current = new PreprocessorToken();
            }

            // Whatever the directive, the remainder of the line belongs to it.
            _reader.SinkToEndOfLine();

            return true;
        }
        // Is it a separator?
        else if (_reader.SinkSeparatorCharacter())
        {
            current = new VisualBasicTokenizer.SeparatorToken();
            return true;
        }
        // Is it an operator?
        else if (_reader.SinkOperator())
        {
            current = new OperatorToken();
            return true;
        }
        // A string?
        else if (_reader.Sink("\""))
        {
            do
            {
                // A doubled quote ("") does not terminate the string;
                // consume any run of them before looking at the next character.
                while (_reader.Sink("\"\""))
                {
                }
            }
            while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

            // Can't end a file inside a string
            // NOTE(review): if EndOfLines becomes true as soon as the final
            // character is consumed, a string whose closing quote is the very
            // last character of the input would also land here -- confirm
            // against VisualBasicTokenCharReader.
            if (_reader.EndOfLines)
            {
                current = new EndOfFileInsideStringToken();
                return true;
            }

            current = new StringLiteralToken();
            return true;
        }

        // We didn't recognize the token, so this is a syntax error.
        // Consume one character so that the next call makes progress.
        _reader.SinkCharacter();
        current = new UnrecognizedToken();
        return true;
    }

    /*
    * Method: Reader
    *
    * Return the token char reader.
    */
    internal override TokenCharReader Reader => _reader;
}
}
|