|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// <spec>http://www.w3.org/TR/xpath#exprlex</spec>
//------------------------------------------------------------------------------
using System.Diagnostics;
namespace System.Xml.Xsl.XPath
{
// Kinds of lexemes produced by XPathScanner. Extends XPathOperator enumeration.
//
// The numbering is significant:
//   * Unknown..Union (<= LastOperator) form the low range that the scanner tests with
//     "_prevKind <= LexKind.LastOperator" when disambiguating '*' and operator names.
//   * Name..Eof (>= FirstStringable, <= LastNonChar) are the non-character kinds that
//     LexKindToString can render as "<name>", "<string literal>" and "<eof>".
//   * Every member after LastNonChar has the numeric value of the character it stands for,
//     which lets the scanner cast a character to LexKind and cast a LexKind back to a character.
internal enum LexKind
{
Unknown, // Unknown lexeme
Or, // Operator 'or'
And, // Operator 'and'
Eq, // Operator '='
Ne, // Operator '!='
Lt, // Operator '<'
Le, // Operator '<='
Gt, // Operator '>'
Ge, // Operator '>='
Plus, // Operator '+'
Minus, // Operator '-'
Multiply, // Operator '*'
Divide, // Operator 'div'
Modulo, // Operator 'mod'
UnaryMinus, // Not used
Union, // Operator '|'
// Alias marking the upper bound of the operator range (not a lexeme of its own).
LastOperator = Union,
DotDot, // '..'
ColonColon, // '::'
SlashSlash, // Operator '//'
Number, // Number (numeric literal)
Axis, // AxisName
Name, // NameTest, NodeType, FunctionName, AxisName, second part of VariableReference
String, // Literal (string literal)
Eof, // End of the expression
// Alias: lowest kind that CheckToken/LexKindToString accept (both assert FirstStringable <= t).
FirstStringable = Name,
// Alias: highest kind that is not encoded as a character code.
LastNonChar = Eof,
// Single-character lexemes; the value of each member is the character itself.
LParens = '(',
RParens = ')',
LBracket = '[',
RBracket = ']',
Dot = '.',
At = '@',
Comma = ',',
Star = '*', // NameTest
Slash = '/', // Operator '/'
Dollar = '$', // First part of VariableReference
RBrace = '}', // Used for AVTs
};
internal sealed class XPathScanner
{
// Expression text being scanned; never changes after construction.
private readonly string _xpathExpr;
// Index of _curChar within _xpathExpr; equals _xpathExpr.Length once the end is reached.
private int _curIndex;
// Character at _curIndex, or '\0' when _curIndex is at the end of the expression.
private char _curChar;
// Kind of the current lexeme.
private LexKind _kind;
// Name part of the current Name lexeme ("*" for "prefix:*"); assigned by NextLex.
private string? _name;
// Prefix of the current Name lexeme; string.Empty when there is no prefix.
private string? _prefix;
// Contents of the current String lexeme without the enclosing quotes; assigned by ScanString.
private string? _stringValue;
// True when the current Name lexeme is followed (after optional whitespace) by '('.
private bool _canBeFunction;
// Index in _xpathExpr at which the current lexeme starts.
private int _lexStart;
// Value of _curIndex when NextLex was entered, i.e. before whitespace preceding the current lexeme was skipped.
private int _prevLexEnd;
// Kind of the lexeme that preceded the current one; consulted by CheckOperator.
private LexKind _prevKind;
// Axis recognized for the current lexeme; XPathAxis.Unknown unless CheckAxis matched an axis name.
private XPathAxis _axis;
// Creates a scanner over the whole of xpathExpr, positioned on its first lexeme.
public XPathScanner(string xpathExpr)
    : this(xpathExpr, startFrom: 0)
{
}

// Creates a scanner that starts reading xpathExpr at index startFrom.
// The first lexeme is scanned immediately, so Kind is valid as soon as the
// constructor returns; any exception thrown while scanning that first lexeme
// therefore surfaces from the constructor.
public XPathScanner(string xpathExpr, int startFrom)
{
    Debug.Assert(xpathExpr != null);

    _xpathExpr = xpathExpr;

    // Has to be in place before NextLex(), which copies _kind into _prevKind.
    _kind = LexKind.Unknown;

    SetSourceIndex(startFrom);
    NextLex();
}
// The complete expression text this scanner reads from.
public string Source => _xpathExpr;

// Kind of the current lexeme.
public LexKind Kind => _kind;

// Index in Source at which the current lexeme starts.
public int LexStart => _lexStart;

// Number of characters between LexStart and the current scanner position.
public int LexSize => _curIndex - _lexStart;

// Scanner position recorded when the scan of the current lexeme began,
// before any whitespace in front of it was skipped.
public int PrevLexEnd => _prevLexEnd;
// Moves the scanner to the given index so that _curIndex == index and _curChar
// holds the character found there ('\0' when index is the end of the expression).
private void SetSourceIndex(int index)
{
    Debug.Assert(index >= 0 && index <= _xpathExpr.Length);

    // Park one position before the target and let NextChar() step onto it;
    // that keeps the end-of-input handling in a single place.
    int parkedIndex = index - 1;
    _curIndex = parkedIndex;
    NextChar();
}
// Advances the scanner by one character. After the call _curChar is the character
// at the new _curIndex, or '\0' when the end of the expression has been reached.
// Must not be called again once the scanner already sits at the end.
private void NextChar()
{
    Debug.Assert(_curIndex >= -1 && _curIndex < _xpathExpr.Length);

    int next = ++_curIndex;
    if (next >= _xpathExpr.Length)
    {
        // Stepped onto the end-of-input position; '\0' is the sentinel for it.
        Debug.Assert(next == _xpathExpr.Length);
        _curChar = '\0';
        return;
    }

    _curChar = _xpathExpr[next];
}
// Name part of the current lexeme ("*" when the lexeme is a "prefix:*" wildcard).
// May only be read while Kind is LexKind.Name.
public string Name
{
    get
    {
        Debug.Assert(_kind == LexKind.Name);

        string? localName = _name;
        Debug.Assert(localName != null);
        return localName;
    }
}
// Prefix of the current lexeme; an empty string when the name is unprefixed.
// May only be read while Kind is LexKind.Name.
public string Prefix
{
    get
    {
        Debug.Assert(_kind == LexKind.Name);

        string? namePrefix = _prefix;
        Debug.Assert(namePrefix != null);
        return namePrefix;
    }
}
// Text of the current lexeme exactly as it appears in the expression.
// The end of the expression has no text of its own, so it is reported through
// its LexKindToString representation instead.
public string RawValue =>
    _kind == LexKind.Eof
        ? LexKindToString(_kind)
        : _xpathExpr.Substring(_lexStart, _curIndex - _lexStart);
// Contents of the current string literal, without the enclosing quotes.
// May only be read while Kind is LexKind.String.
public string StringValue
{
    get
    {
        Debug.Assert(_kind == LexKind.String);

        string? literal = _stringValue;
        Debug.Assert(literal != null);
        return literal;
    }
}
// True when the character following a QName (possibly after intervening
// ExprWhitespace) is '('. In that case the token must be recognized as a NodeType
// or a FunctionName, unless it is an OperatorName. The scanner cannot make that
// last distinction without knowing the previous lexeme: for example, "or" in
// "... or (1 != 0)" may be an OperatorName or a FunctionName.
// May only be read while Kind is LexKind.Name.
public bool CanBeFunction
{
    get
    {
        Debug.Assert(_kind == LexKind.Name);

        bool followedByParen = _canBeFunction;
        return followedByParen;
    }
}
// Axis named by the current lexeme.
// May only be read while Kind is LexKind.Axis, in which case it is never XPathAxis.Unknown.
public XPathAxis Axis
{
    get
    {
        Debug.Assert(_kind == LexKind.Axis);

        XPathAxis recognizedAxis = _axis;
        Debug.Assert(recognizedAxis != XPathAxis.Unknown);
        return recognizedAxis;
    }
}
// Advances the scanner until the current character is no longer whitespace
// (as classified by XmlCharType.IsWhiteSpace). Does nothing when it already isn't.
private void SkipSpace()
{
    while (true)
    {
        if (!XmlCharType.IsWhiteSpace(_curChar))
        {
            return;
        }

        NextChar();
    }
}
// True only for the ten ASCII digits '0' through '9'; every other character,
// including non-ASCII Unicode digits, yields false.
private static bool IsAsciiDigit(char ch) => '0' <= ch && ch <= '9';
// Advances the scanner to the next lexeme of the expression.
//
// On return _kind describes the lexeme and _lexStart/_curIndex delimit the text that
// was scanned for it; depending on the kind, _name/_prefix/_canBeFunction, _axis or
// _stringValue are filled in as well. Whitespace in front of the lexeme is skipped.
//
// Exceptions created by CreateException are thrown from ScanString (string literal
// without a closing quote) and ScanNumber (number written in scientific notation).
public void NextLex()
{
// Remember where the previous lexeme ended and what it was. CheckOperator relies on
// _prevKind to decide whether '*' or an operator name is an operator in this position.
_prevLexEnd = _curIndex;
_prevKind = _kind;
SkipSpace();
_lexStart = _curIndex;
switch (_curChar)
{
case '\0':
// '\0' is what NextChar reports at the end of the expression. Nothing is consumed.
_kind = LexKind.Eof;
return;
case '(':
case ')':
case '[':
case ']':
case '@':
case ',':
case '$':
case '}':
// The LexKind members for these characters have the character's own value,
// so the character can be cast directly.
_kind = (LexKind)_curChar;
NextChar();
break;
case '.':
// ".." | a number that starts with its decimal point (".5") | a single "."
NextChar();
if (_curChar == '.')
{
_kind = LexKind.DotDot;
NextChar();
}
else if (IsAsciiDigit(_curChar))
{
// Rewind onto the '.' and let the number branch scan the whole literal.
SetSourceIndex(_lexStart);
goto case '0';
}
else
{
_kind = LexKind.Dot;
}
break;
case ':':
// Only "::" is a lexeme. A single ':' is consumed and reported as Unknown.
NextChar();
if (_curChar == ':')
{
_kind = LexKind.ColonColon;
NextChar();
}
else
{
_kind = LexKind.Unknown;
}
break;
case '*':
// Star (name test) unless CheckOperator decides, from the previous lexeme,
// that this is the multiplication operator and overwrites _kind.
_kind = LexKind.Star;
NextChar();
CheckOperator(true);
break;
case '/':
NextChar();
if (_curChar == '/')
{
_kind = LexKind.SlashSlash;
NextChar();
}
else
{
_kind = LexKind.Slash;
}
break;
case '|':
_kind = LexKind.Union;
NextChar();
break;
case '+':
_kind = LexKind.Plus;
NextChar();
break;
case '-':
_kind = LexKind.Minus;
NextChar();
break;
case '=':
_kind = LexKind.Eq;
NextChar();
break;
case '!':
// Only "!=" is a lexeme. A single '!' is consumed and reported as Unknown.
NextChar();
if (_curChar == '=')
{
_kind = LexKind.Ne;
NextChar();
}
else
{
_kind = LexKind.Unknown;
}
break;
case '<':
NextChar();
if (_curChar == '=')
{
_kind = LexKind.Le;
NextChar();
}
else
{
_kind = LexKind.Lt;
}
break;
case '>':
NextChar();
if (_curChar == '=')
{
_kind = LexKind.Ge;
NextChar();
}
else
{
_kind = LexKind.Gt;
}
break;
case '"':
case '\'':
// Either quote character opens a string literal; ScanString looks for the same one to close it.
_kind = LexKind.String;
ScanString();
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
// Also reached through "goto case '0'" with _curChar == '.' for literals such as ".5".
_kind = LexKind.Number;
ScanNumber();
break;
default:
if (XmlCharType.IsStartNCNameSingleChar(_curChar))
{
// Start out as an unprefixed Name; the look-ahead below may add a prefix,
// and CheckOperator/CheckAxis may change the kind.
_kind = LexKind.Name;
_name = ScanNCName();
_prefix = string.Empty;
_canBeFunction = false;
_axis = XPathAxis.Unknown;
bool colonColon = false;
// Position right after the NCName; look-ahead that must not be consumed rewinds to it.
int saveSourceIndex = _curIndex;
// "foo:bar" or "foo:*" -- one lexeme (no spaces allowed)
// "foo::" or "foo ::" -- two lexemes, reported as one (AxisName)
// "foo:?" or "foo :?" -- lexeme "foo" reported
if (_curChar == ':')
{
NextChar();
if (_curChar == ':')
{ // "foo::" -> OperatorName, AxisName
NextChar();
colonColon = true;
// The "::" was only peeked at; rewind so the lexeme ends right after the name.
SetSourceIndex(saveSourceIndex);
}
else
{ // "foo:bar", "foo:*" or "foo:?"
if (_curChar == '*')
{
NextChar();
_prefix = _name;
_name = "*";
}
else if (XmlCharType.IsStartNCNameSingleChar(_curChar))
{
_prefix = _name;
_name = ScanNCName();
// Look ahead for '(' to determine whether QName can be a FunctionName
saveSourceIndex = _curIndex;
SkipSpace();
_canBeFunction = (_curChar == '(');
SetSourceIndex(saveSourceIndex);
}
else
{ // "foo:?" -> OperatorName, NameTest
// Return "foo" and leave ":" to be reported later as an unknown lexeme
SetSourceIndex(saveSourceIndex);
}
}
}
else
{
SkipSpace();
if (_curChar == ':')
{ // "foo ::" or "foo :?"
NextChar();
if (_curChar == ':')
{
NextChar();
colonColon = true;
}
// Whether or not "::" was found, rewind to right after the name.
SetSourceIndex(saveSourceIndex);
}
else
{
// NOTE(review): unlike the other look-ahead paths, the whitespace skipped above is
// not rewound here, so LexSize/RawValue of this lexeme cover any trailing whitespace
// -- confirm that callers expect this.
_canBeFunction = (_curChar == '(');
}
}
// An operator name ("or", "and", "div", "mod") takes precedence; only a name that is
// not an operator and is followed by "::" is tried as an axis name.
if (!CheckOperator(false) && colonColon)
{
_axis = CheckAxis();
}
}
else
{
// Not the start of any lexeme: consume the character and report it as Unknown.
_kind = LexKind.Unknown;
NextChar();
}
break;
}
}
// Decides whether the lexeme that was just scanned is an operator in its current
// position and, if it is, overwrites _kind with the operator kind.
//   star == true  : the lexeme is '*', a candidate for LexKind.Multiply.
//   star == false : the lexeme is a name; an unprefixed "or", "and", "div" or "mod"
//                   is a candidate for the corresponding operator kind.
// Returns true when _kind was changed to an operator kind; false when the lexeme
// keeps the kind it already had.
private bool CheckOperator(bool star)
{
    LexKind candidate;

    if (star)
    {
        candidate = LexKind.Multiply;
    }
    else
    {
        Debug.Assert(_prefix != null);
        Debug.Assert(_name != null);

        // Operator names never carry a prefix and are at most three characters long.
        if (_prefix.Length != 0 || _name.Length > 3)
        {
            return false;
        }

        if (_name == "or")
        {
            candidate = LexKind.Or;
        }
        else if (_name == "and")
        {
            candidate = LexKind.And;
        }
        else if (_name == "div")
        {
            candidate = LexKind.Divide;
        }
        else if (_name == "mod")
        {
            candidate = LexKind.Modulo;
        }
        else
        {
            return false;
        }
    }

    // If there is a preceding token and the preceding token is not one of '@', '::', '(', '[', ','
    // or an Operator, then a '*' must be recognized as a MultiplyOperator and an NCName must be
    // recognized as an OperatorName. Kinds up to LastOperator cover the operators as well as
    // Unknown (no preceding token yet); '/', '//' and '$' are excluded here too.
    bool operatorPosition =
        _prevKind > LexKind.LastOperator
        && _prevKind != LexKind.Slash
        && _prevKind != LexKind.SlashSlash
        && _prevKind != LexKind.At
        && _prevKind != LexKind.ColonColon
        && _prevKind != LexKind.LParens
        && _prevKind != LexKind.LBracket
        && _prevKind != LexKind.Comma
        && _prevKind != LexKind.Dollar;

    if (!operatorPosition)
    {
        return false;
    }

    _kind = candidate;
    return true;
}
// Tries to interpret _name as one of the XPath axis names.
// On a match _kind becomes LexKind.Axis and the matching axis is returned;
// otherwise _kind is set to LexKind.Name and XPathAxis.Unknown is returned.
private XPathAxis CheckAxis()
{
    XPathAxis recognized;

    switch (_name)
    {
        case "ancestor":
            recognized = XPathAxis.Ancestor;
            break;
        case "ancestor-or-self":
            recognized = XPathAxis.AncestorOrSelf;
            break;
        case "attribute":
            recognized = XPathAxis.Attribute;
            break;
        case "child":
            recognized = XPathAxis.Child;
            break;
        case "descendant":
            recognized = XPathAxis.Descendant;
            break;
        case "descendant-or-self":
            recognized = XPathAxis.DescendantOrSelf;
            break;
        case "following":
            recognized = XPathAxis.Following;
            break;
        case "following-sibling":
            recognized = XPathAxis.FollowingSibling;
            break;
        case "namespace":
            recognized = XPathAxis.Namespace;
            break;
        case "parent":
            recognized = XPathAxis.Parent;
            break;
        case "preceding":
            recognized = XPathAxis.Preceding;
            break;
        case "preceding-sibling":
            recognized = XPathAxis.PrecedingSibling;
            break;
        case "self":
            recognized = XPathAxis.Self;
            break;
        default:
            // Not an axis name: the lexeme is reported as an ordinary name.
            _kind = LexKind.Name;
            return XPathAxis.Unknown;
    }

    _kind = LexKind.Axis;
    return recognized;
}
// Scans a numeric literal of the form Digits, Digits '.', Digits '.' Digits or '.' Digits,
// leaving the scanner on the first character after it.
// A literal followed by an exponent ('e' or 'E', optional sign, digits) is rejected:
// the exponent is consumed first, so the exception created by CreateException spans
// the whole literal, and then that exception (SR.XPath_ScientificNotation) is thrown.
private void ScanNumber()
{
    Debug.Assert(IsAsciiDigit(_curChar) || _curChar == '.');

    // Integer part (empty when the literal starts with '.').
    SkipDigits();

    // Optional fractional part.
    if (_curChar == '.')
    {
        NextChar();
        SkipDigits();
    }

    bool hasExponent = _curChar == 'E' || _curChar == 'e';
    if (hasExponent)
    {
        NextChar();

        bool hasSign = _curChar == '+' || _curChar == '-';
        if (hasSign)
        {
            NextChar();
        }

        SkipDigits();
        throw CreateException(SR.XPath_ScientificNotation);
    }

    // Consumes a (possibly empty) run of ASCII digits.
    void SkipDigits()
    {
        while (IsAsciiDigit(_curChar))
        {
            NextChar();
        }
    }
}
// Scans a string literal. On entry _curChar is the opening quote (' or "); the literal
// runs up to the next occurrence of that same quote character.
// On success _stringValue holds the text between the quotes and the scanner is
// positioned right after the closing quote.
// When no closing quote exists, the scanner is moved to the end of the expression
// and the exception created for SR.XPath_UnclosedString is thrown.
private void ScanString()
{
    char quote = _curChar;
    int contentStart = _curIndex + 1;
    int closingQuote = _xpathExpr.IndexOf(quote, contentStart);

    if (closingQuote >= 0)
    {
        _stringValue = _xpathExpr.Substring(contentStart, closingQuote - contentStart);
        SetSourceIndex(closingQuote + 1);
        return;
    }

    // Move to the end first so the exception covers the rest of the expression.
    SetSourceIndex(_xpathExpr.Length);
    throw CreateException(SR.XPath_UnclosedString);
}
// Scans an NCName that begins at the current character and returns its text.
// The scanner is left on the first character that XmlCharType.IsNCNameSingleChar rejects.
private string ScanNCName()
{
    Debug.Assert(XmlCharType.IsStartNCNameSingleChar(_curChar));

    int nameStart = _curIndex;
    while (XmlCharType.IsNCNameSingleChar(_curChar))
    {
        NextChar();
    }

    int nameLength = _curIndex - nameStart;
    return _xpathExpr.Substring(nameStart, nameLength);
}
// Requires the current lexeme to be of kind t and then moves on to the next lexeme.
// When the current lexeme is of a different kind, CheckToken throws and the
// scanner does not advance.
public void PassToken(LexKind t)
{
    CheckToken(t);

    // The expected lexeme is present; step past it.
    NextLex();
}
// Verifies that the current lexeme is of kind t without consuming it.
// On a mismatch an exception created by CreateException is thrown:
// SR.XPath_EofExpected (with the offending text) when the end of the expression was
// expected, otherwise SR.XPath_TokenExpected (with the expected token and the offending text).
// t must be one of the kinds LexKindToString can render (FirstStringable or above).
public void CheckToken(LexKind t)
{
    Debug.Assert(LexKind.FirstStringable <= t);

    if (_kind == t)
    {
        return;
    }

    throw t == LexKind.Eof
        ? CreateException(SR.XPath_EofExpected, RawValue)
        : CreateException(SR.XPath_TokenExpected, LexKindToString(t), RawValue);
}
// Renders a lexeme kind for use in error messages.
// May be called for the following tokens: Name, String, Eof, Comma, LParens, RParens,
// LBracket, RBracket, RBrace. Name, String and Eof have fixed descriptions; kinds above
// LastNonChar carry a character code and are rendered as that single character.
private static string LexKindToString(LexKind t)
{
    Debug.Assert(LexKind.FirstStringable <= t);

    switch (t)
    {
        case LexKind.Name:
            return "<name>";
        case LexKind.String:
            return "<string literal>";
        case LexKind.Eof:
            return "<eof>";
    }

    if (t > LexKind.LastNonChar)
    {
        char symbol = (char)t;
        Debug.Assert("()[].@,*/$}".Contains(symbol));
        return char.ToString(symbol);
    }

    // Any other kind has no textual form; callers are not expected to pass one.
    Debug.Fail($"Unexpected LexKind: {t}");
    return string.Empty;
}
// Builds (but does not throw) a compile exception for the current lexeme: it carries
// the whole expression, the start of the current lexeme and the current scanner
// position, together with the resource id and its message arguments.
public XPathCompileException CreateException(string resId, params string[] args)
    => new XPathCompileException(_xpathExpr, _lexStart, _curIndex, resId, args);
}
}
|