// File: Expression\Lexer.cs
// Web Access
// Project: src\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj (Microsoft.ML.Transforms)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
 
namespace Microsoft.ML.Transforms
{
    /// <summary>
    /// The lexer. This is effectively a template. Call LexSource to get an Enumerable of tokens.
    /// </summary>
    [BestFriend]
    internal partial class Lexer
    {
        // Name pool that identifier, punctuator and comment text is added to while lexing.
        private readonly NormStr.Pool _pool;
        // Table consulted to recognize punctuators and key words.
        private readonly KeyWordTable _kwt;

        /// <summary>
        /// Creates a lexer. The caller supplies both the name pool and the key word table;
        /// neither may be null.
        /// </summary>
        public Lexer(NormStr.Pool pool, KeyWordTable kwt)
        {
            Contracts.AssertValue(pool);
            Contracts.AssertValue(kwt);

            _kwt = kwt;
            _pool = pool;
        }
 
        /// <summary>
        /// Lazily lexes the text behind <paramref name="cursor"/>. Yields every token produced
        /// by the lexer and finishes with a single end-of-file token.
        /// </summary>
        public IEnumerable<Token> LexSource(CharCursor cursor)
        {
            Contracts.AssertValue(cursor);

            var impl = new LexerImpl(this, cursor);
            for (Token tok = impl.GetNextToken(); tok != null; tok = impl.GetNextToken())
                yield return tok;

            // GetNextToken returned null, so the input is exhausted.
            yield return impl.GetEof();
        }
 
        private partial class LexerImpl
        {
            // The owning lexer; gives access to its name pool and key word table.
            private readonly Lexer _lex;
            // The character source being lexed.
            private readonly CharCursor _cursor;

            // Scratch buffer used while building a token.
            private readonly StringBuilder _sb;
            // Index at which the current token starts.
            private int _ichMinTok;
            // Holds pending tokens when lexing yields more than one at a time.
            private readonly Queue<Token> _queue;
#pragma warning disable 414
            // This will be used by any pre-processor, so keep it around.
            private bool _fLineStart;
#pragma warning restore 414

            /// <summary>
            /// Binds the implementation to its owning lexer and to the cursor it reads from.
            /// </summary>
            public LexerImpl(Lexer lex, CharCursor cursor)
            {
                _lex = lex;
                _cursor = cursor;

                _fLineStart = true;
                _queue = new Queue<Token>(4);
                _sb = new StringBuilder();
            }
 
            /// <summary>
            /// True once the cursor has reached the end of input. When this is true, ChCur is zero.
            /// </summary>
            private bool Eof
            {
                get { return _cursor.Eof; }
            }
 
            /// <summary>
            /// The character under the cursor, or zero once the end of input has been reached.
            /// </summary>
            private char ChCur { get { return _cursor.ChCur; } }
 
            /// <summary>
            /// Moves the cursor to the next character and returns what the cursor reports for it.
            /// </summary>
            private char ChNext()
            {
                char chNew = _cursor.ChNext();
                return chNew;
            }
 
            /// <summary>
            /// Asks the cursor for the character at offset <paramref name="ich"/>; callers in this
            /// class pass 1 to look at the character following the current one.
            /// </summary>
            private char ChPeek(int ich)
            {
                char chAhead = _cursor.ChPeek(ich);
                return chAhead;
            }
 
            /// <summary>
            /// Records the cursor's current position as the start of the current token.
            /// </summary>
            private void StartTok()
            {
                int ichNow = _cursor.IchCur;
                _ichMinTok = ichNow;
            }
 
            /// <summary>
            /// Queues an error token without arguments, spanning from the start of the
            /// current token to the cursor.
            /// </summary>
            private void ReportError(ErrId eid)
            {
                int ichLim = _cursor.IchCur;
                ReportError(_ichMinTok, ichLim, eid, null);
            }

            /// <summary>
            /// Queues an error token with message arguments, spanning from the start of the
            /// current token to the cursor.
            /// </summary>
            private void ReportError(ErrId eid, params object[] args)
            {
                int ichLim = _cursor.IchCur;
                ReportError(_ichMinTok, ichLim, eid, args);
            }

            /// <summary>
            /// Queues an error token covering the explicit range [ichMin, ichLim). The token
            /// is handed out by a later GetNextToken call.
            /// </summary>
            private void ReportError(int ichMin, int ichLim, ErrId eid, params object[] args)
            {
                // REVIEW: Fix this so the error is marked as nested if appropriate!
                TextSpan span = GetTextSpan(ichMin, ichLim);
                _queue.Enqueue(new ErrorToken(span, eid, args));
            }
 
            /// <summary>
            /// Returns the span from the start of the current token to the cursor, and then
            /// makes the cursor position the start of the next token.
            /// </summary>
            private TextSpan GetSpan()
            {
                TextSpan result = new TextSpan(_ichMinTok, _cursor.IchCur);
                // Same effect as StartTok(): the next token begins where this one ended.
                _ichMinTok = _cursor.IchCur;
                return result;
            }
 
            /// <summary>
            /// Builds a span for the given range without touching the current token start.
            /// </summary>
            private TextSpan GetTextSpan(int ichMin, int ichLim)
            {
                TextSpan span = new TextSpan(ichMin, ichLim);
                return span;
            }
 
            /// <summary>
            /// Produces the next token, or null once the input is exhausted and nothing is queued.
            /// </summary>
            public Token GetNextToken()
            {
                // Comments and string literals can have new line and error tokens "nested"
                // inside them. Those go through the queue, so the outer token (returned
                // directly by FetchToken) always comes out before the nested ones.
                for (; ; )
                {
                    if (_queue.Count > 0)
                    {
                        // Only new lines and errors should be enqueued.
                        Token tokQueued = _queue.Dequeue();
                        Contracts.Assert(tokQueued.Kind == TokKind.NewLine || tokQueued.Kind == TokKind.Error);
                        return tokQueued;
                    }

                    if (Eof)
                        return null;

                    // FetchToken may return null (e.g. for spacing); loop until something turns up.
                    Token tokFetched = FetchToken();
                    if (tokFetched != null)
                        return tokFetched;
                }
            }
 
            /// <summary>
            /// Creates the end-of-file token: an empty span at the cursor position. Call this
            /// after GetNextToken has returned null.
            /// </summary>
            public EofToken GetEof()
            {
                Contracts.Assert(Eof);

                int ichEnd = _cursor.IchCur;
                TextSpan span = GetTextSpan(ichEnd, ichEnd);
                return new EofToken(span);
            }
 
            /// <summary>
            /// Starts a new token at the cursor and dispatches on the start kind of the current
            /// character. Returns the lexed token, or null when the helper that ran produced no
            /// direct token (for example spacing, a line terminator, or a stray '@').
            /// </summary>
            private Token FetchToken()
            {
                Contracts.Assert(!Eof);
                StartTok();

                LexStartKind kind = LexCharUtils.StartKind(ChCur);

                // Spacing and pre-processor starts leave the line-start flag alone; anything else clears it.
                bool fKeepsLineStart = kind == LexStartKind.Space || kind == LexStartKind.PreProc;
                if (!fKeepsLineStart)
                    _fLineStart = false;

                switch (kind)
                {
                    case LexStartKind.Space:
                        return LexSpace();

                    case LexStartKind.LineTerm:
                        // The new line token is queued by LexLineTerm rather than returned.
                        LexLineTerm();
                        return null;

                    case LexStartKind.Ident:
                        return LexIdent();

                    case LexStartKind.Punc:
                        return LexPunc();

                    case LexStartKind.NumLit:
                        return LexNumLit();

                    case LexStartKind.StrLit:
                        return LexStrLit();

                    case LexStartKind.Comment:
                        return LexComment();

                    case LexStartKind.PreProc:
                        return LexPreProc();

                    case LexStartKind.Verbatim:
                        {
                            // '@' must introduce either a verbatim string or a verbatim identifier.
                            char chAfter = ChPeek(1);
                            if (chAfter == '"')
                                return LexStrLit();
                            if (LexCharUtils.StartKind(chAfter) == LexStartKind.Ident)
                                return LexIdent();

                            // Skip the lone '@' and queue an error for it.
                            ChNext();
                            ReportError(ErrId.VerbatimLiteralExpected);
                            return null;
                        }

                    default:
                        return LexError();
                }
            }
 
            /// <summary>
            /// Lexes a punctuator (operator) starting at the current character. Grows a candidate
            /// one character at a time while the key word table still accepts it, remembering
            /// the last candidate that was a real punctuator, and consumes exactly that many
            /// characters. Falls back to LexError when no real punctuator was found.
            /// </summary>
            private Token LexPunc()
            {
                _sb.Length = 0;
                _sb.Append(ChCur);

                int cchBest = 0;
                TokKind tidBest = TokKind.None;

                bool fExtend = true;
                while (fExtend)
                {
                    TokKind tidCandidate;
                    if (!_lex._kwt.IsPunctuator(_lex._pool.Add(_sb), out tidCandidate))
                        break;

                    // TokKind.None means the candidate is only a prefix of some punctuator.
                    if (tidCandidate != TokKind.None)
                    {
                        tidBest = tidCandidate;
                        cchBest = _sb.Length;
                    }

                    // Peek only; the cursor is advanced after the best match is known.
                    char chAhead = ChPeek(_sb.Length);
                    fExtend = LexCharUtils.IsPunc(chAhead);
                    if (fExtend)
                        _sb.Append(chAhead);
                }

                if (cchBest == 0)
                    return LexError();

                for (int cch = 0; cch < cchBest; cch++)
                    ChNext();
                return KeyToken.Create(GetSpan(), tidBest);
            }
 
            /// <summary>
            /// Called to lex a numeric literal or a Dot token. Asserts the current
            /// character has start kind LexStartKind.NumLit, i.e. is a digit or '.'.
            /// Dispatches to LexPunc (lone '.'), LexHexInt (0x prefix followed by a hex digit),
            /// LexRealNum (decimal point, exponent or real suffix present) or LexDecInt.
            /// The decimal digits, decimal point and exponent are collected in _sb for the
            /// real/decimal helpers; suffix characters are not added to _sb.
            /// </summary>
            private Token LexNumLit()
            {
                Contracts.Assert(LexCharUtils.StartKind(ChCur) == LexStartKind.NumLit);
                Contracts.Assert(LexCharUtils.IsDigit(ChCur) || ChCur == '.');

                // A dot not followed by a digit is just a Dot. This is a very common case (hence first).
                if (ChCur == '.' && !LexCharUtils.IsDigit(ChPeek(1)))
                    return LexPunc();

                // Check for a hex literal. Note that 0x followed by a non-hex-digit is really a 0 followed
                // by an identifier.
                if (ChCur == '0' && (ChPeek(1) == 'x' || ChPeek(1) == 'X') && LexCharUtils.IsHexDigit(ChPeek(2)))
                {
                    // Advance to first hex digit.
                    ChNext();
                    ChNext();
                    return LexHexInt();
                }

                // Decimal literal (possible floating point).
                Contracts.Assert(LexCharUtils.IsDigit(ChCur) || ChCur == '.' && LexCharUtils.IsDigit(ChPeek(1)));
                bool fExp = false;
                bool fDot = ChCur == '.';
                _sb.Length = 0;
                _sb.Append(ChCur);

                // Collect the remaining digits and at most one decimal point.
                for (; ; )
                {
                    if (ChNext() == '.')
                    {
                        // A second dot, or a dot not followed by a digit, is not part of this literal.
                        if (fDot || !LexCharUtils.IsDigit(ChPeek(1)))
                            break;
                        fDot = true;
                    }
                    else if (!LexCharUtils.IsDigit(ChCur))
                        break;
                    _sb.Append(ChCur);
                }

                // Check for an exponent.
                if (ChCur == 'e' || ChCur == 'E')
                {
                    // Only treat this as an exponent when a digit, or a sign and then a digit, follows;
                    // otherwise the 'e' is left in the input for the next token.
                    char chTmp = ChPeek(1);
                    if (LexCharUtils.IsDigit(chTmp) || (chTmp == '+' || chTmp == '-') && LexCharUtils.IsDigit(ChPeek(2)))
                    {
                        fExp = true;
                        // Append the 'e'/'E', then the sign or first digit, then any further digits.
                        _sb.Append(ChCur);
                        _sb.Append(ChNext());
                        while (LexCharUtils.IsDigit(chTmp = ChNext()))
                            _sb.Append(chTmp);
                    }
                }

                // A decimal point or exponent makes the literal real; so does a real suffix on plain digits.
                bool fReal = fDot || fExp;
                char chSuf = LexRealSuffix(fReal);
                if (fReal || chSuf != '\0')
                    return LexRealNum(chSuf);

                // Integer type.
                return LexDecInt(LexIntSuffix());
            }
 
            /// <summary>
            /// Lex a hex literal optionally followed by an integer suffix. Asserts the current
            /// character is a hex digit (the caller has already skipped the "0x" prefix, which
            /// remains part of the token span). If the digits do not fit in 64 bits, a single
            /// IntOverflow error is queued and the value is clamped to ulong.MaxValue.
            /// </summary>
            /// <returns>An IntLitToken flagged as Hex, whose span covers the prefix, digits and suffix.</returns>
            private Token LexHexInt()
            {
                Contracts.Assert(LexCharUtils.IsHexDigit(ChCur));

                ulong u = 0;
                bool fOverflow = false;

                do
                {
                    // If any of the top four bits is set, shifting in another digit loses data.
                    // Report the overflow only once per literal.
                    if ((u & 0xF000000000000000) != 0 && !fOverflow)
                    {
                        ReportError(ErrId.IntOverflow);
                        fOverflow = true;
                    }
                    u = (u << 4) + (ulong)LexCharUtils.GetHexVal(ChCur);
                } while (LexCharUtils.IsHexDigit(ChNext()));

                if (fOverflow)
                    u = ulong.MaxValue;

                // Consume the suffix BEFORE taking the span. Arguments are evaluated left to right,
                // so calling LexIntSuffix inside the constructor call after GetSpan() would leave the
                // suffix characters outside the token's span (unlike decimal and real literals).
                IntLitKind ilk = LexIntSuffix() | IntLitKind.Hex;
                return new IntLitToken(GetSpan(), u, ilk);
            }
 
            /// <summary>
            /// Builds a decimal integer literal token from the digits already collected in _sb.
            /// When the value does not fit in a ulong, an IntOverflow error is queued and the
            /// value becomes ulong.MaxValue.
            /// </summary>
            private Token LexDecInt(IntLitKind ilk)
            {
                // The caller has placed the digits in _sb.
                Contracts.Assert(_sb.Length > 0);

                ulong uValue = 0;
                try
                {
                    int cch = _sb.Length;
                    int ichDigit = 0;
                    while (ichDigit < cch)
                    {
                        // Both the conversion and the accumulation are overflow-checked.
                        ulong uDigit = checked((ulong)LexCharUtils.GetDecVal(_sb[ichDigit]));
                        uValue = checked(uValue * 10 + uDigit);
                        ichDigit++;
                    }
                }
                catch (OverflowException)
                {
                    ReportError(ErrId.IntOverflow);
                    uValue = ulong.MaxValue;
                }

                return new IntLitToken(GetSpan(), uValue, ilk);
            }
 
            /// <summary>
            /// Lex a real literal (float or double). The characters of the literal (digits,
            /// optional decimal point and exponent, but not the suffix) must already be in _sb.
            /// If the value is out of range, a FloatOverflow error is queued and the token
            /// carries positive infinity.
            /// </summary>
            /// <param name="chSuf">The normalized suffix: 'F' for float; 'D' or '\0' for double.</param>
            /// <returns>A FltLitToken when <paramref name="chSuf"/> is 'F', otherwise a DblLitToken.</returns>
            private Token LexRealNum(char chSuf)
            {
                // Digits are in _sb.
                Contracts.Assert(_sb.Length > 0);

                bool fFloat = chSuf == 'F';
                // NOTE(review): LexRealSuffix can also return 'L'; that trips this assert and is then
                // treated as a suffixed double - confirm the intended handling.
                Contracts.Assert(fFloat || chSuf == '\0' || chSuf == 'D');

                double dbl;
                bool fOverflow;
                try
                {
                    // The literal syntax always uses '.', so parse with the invariant culture. Using the
                    // current culture would fail or misparse where the decimal separator is not '.'.
                    dbl = double.Parse(_sb.ToString(), NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent,
                        CultureInfo.InvariantCulture);
                    // Newer runtimes return infinity for out-of-range input instead of throwing.
                    fOverflow = double.IsInfinity(dbl);
                }
                catch (OverflowException)
                {
                    // Older runtimes signal out-of-range input with an exception.
                    dbl = double.PositiveInfinity;
                    fOverflow = true;
                }

                // Report before GetSpan resets the token start, so the error covers the literal
                // (as integer overflow errors do) rather than an empty span after it.
                if (fOverflow)
                    ReportError(ErrId.FloatOverflow, fFloat ? "float" : "double");

                TextSpan span = GetSpan();
                if (fFloat)
                    return new FltLitToken(span, (float)dbl);
                return new DblLitToken(span, dbl, chSuf != 0);
            }
 
            /// <summary>
            /// Consumes an optional integer suffix made of 'U' and/or 'L' (either case, either
            /// order) and returns the matching flags. Stops, without consuming, at the first
            /// character that is not a suffix letter or that repeats one already seen.
            /// </summary>
            private IntLitKind LexIntSuffix()
            {
                IntLitKind ilk = IntLitKind.None;

                for (; ; )
                {
                    IntLitKind flag;
                    switch (ChCur)
                    {
                        case 'u':
                        case 'U':
                            flag = IntLitKind.Uns;
                            break;
                        case 'l':
                        case 'L':
                            flag = IntLitKind.Lng;
                            break;
                        default:
                            return ilk;
                    }

                    // A repeated letter is not part of the suffix.
                    if ((ilk & flag) != 0)
                        return ilk;

                    ilk |= flag;
                    ChNext();
                }
            }
 
            /// <summary>
            /// Consumes an optional real suffix and returns it in upper case: 'D' or 'F', or 'L'
            /// but only when <paramref name="fKnown"/> is true (the literal is already known to
            /// be real). Returns '\0', consuming nothing, when there is no such suffix.
            /// </summary>
            private char LexRealSuffix(bool fKnown)
            {
                char chCur = ChCur;
                char chSuf;

                if (chCur == 'd' || chCur == 'D')
                    chSuf = 'D';
                else if (chCur == 'f' || chCur == 'F')
                    chSuf = 'F';
                else if ((chCur == 'l' || chCur == 'L') && fKnown)
                    chSuf = 'L';
                else
                    return '\0';

                // Step past the suffix character.
                ChNext();
                return chSuf;
            }
 
            /// <summary>
            /// Lex a string or character literal. Handles three forms: a verbatim string
            /// (@"..."), a regular double-quoted string, and a single-quoted character literal.
            /// The literal's content is accumulated in _sb. Returns a StrLitToken for
            /// double-quoted forms and a CharLitToken for the single-quoted form; problems
            /// (unterminated literal, bad escape, wrong character count) are queued as errors.
            /// </summary>
            private Token LexStrLit()
            {
                char chQuote;

                _sb.Length = 0;
                if (ChCur == '@')
                {
                    // Verbatim string: no escapes, may span lines, "" stands for one quote.
                    chQuote = '"';
                    ChNext();
                    Contracts.Assert(ChCur == '"');
                    ChNext();
                    for (; ; )
                    {
                        char ch = ChCur;
                        if (ch == '"')
                        {
                            // A single quote closes the literal; a doubled quote falls through
                            // and one quote character is appended below.
                            ChNext();
                            if (ChCur != '"')
                                break;
                            ChNext();
                        }
                        else if (LexCharUtils.IsLineTerm(ch))
                            // Queues a nested new line token and returns the terminator to append.
                            ch = LexLineTerm(_sb);
                        else if (Eof)
                        {
                            ReportError(ErrId.UnterminatedString);
                            break;
                        }
                        else
                            ChNext();
                        _sb.Append(ch);
                    }
                }
                else
                {
                    Contracts.Assert(ChCur == '"' || ChCur == '\'');
                    chQuote = ChCur;

                    ChNext();
                    for (; ; )
                    {
                        char ch = ChCur;
                        // The literal ends at the matching quote, the end of input or a line terminator.
                        if (ch == chQuote || Eof || LexCharUtils.IsLineTerm(ch))
                            break;
                        if (ch == '\\')
                        {
                            uint u;
                            // On failure the error is already queued and the backslash has been
                            // consumed, so just carry on with whatever follows.
                            if (!FLexEscChar(false, out u))
                                continue;
                            if (u < 0x10000)
                                ch = (char)u;
                            else
                            {
                                // Values of 0x10000 and above need two chars: append the first
                                // here and let the common append below add the second.
                                char chT;
                                if (!ConvertToSurrogatePair(u, out chT, out ch))
                                    continue;
                                _sb.Append(chT);
                            }
                        }
                        else
                            ChNext();
                        _sb.Append(ch);
                    }

                    // Stopped on something other than the closing quote (line terminator or end of input).
                    if (ChCur != chQuote)
                        ReportError(ErrId.NewlineInConst);
                    else
                        ChNext();
                }

                if (chQuote == '"')
                    return new StrLitToken(GetSpan(), _sb.ToString());

                // Character literal: exactly one char is expected in _sb.
                if (_sb.Length != 1)
                    ReportError(_sb.Length == 0 ? ErrId.CharConstEmpty : ErrId.CharConstTooLong);
                return new CharLitToken(GetSpan(), _sb.Length > 0 ? _sb[0] : '\0');
            }
 
            /// <summary>
            /// Lex a character escape starting at the backslash under the cursor.
            /// Returns true if successful, in which case <paramref name="u"/> holds the escaped
            /// value. When <paramref name="fUniOnly"/> is true only the \u and \U forms are
            /// accepted. On failure a BadEscape error is queued, u is not meaningful, and the
            /// backslash has been consumed.
            /// </summary>
            private bool FLexEscChar(bool fUniOnly, out uint u)
            {
                Contracts.Assert(ChCur == '\\');

                // Position of the backslash, used as the start of any error span.
                int ichErr = _cursor.IchCur;
                bool fUni;
                int cchHex;

                // Step past the backslash and look at the escape selector.
                switch (ChNext())
                {
                    case 'u':
                        // \u takes four hex digits.
                        fUni = true;
                        cchHex = 4;
                        goto LHex;
                    case 'U':
                        // \U takes eight hex digits.
                        fUni = true;
                        cchHex = 8;
                        goto LHex;
                    default:
                        if (!fUniOnly)
                        {
                            switch (ChCur)
                            {
                                default:
                                    goto LBad;
                                case 'x':
                                case 'X':
                                    // \x takes between one and four hex digits.
                                    fUni = false;
                                    cchHex = 4;
                                    goto LHex;
                                case '\'':
                                    u = 0x0027;
                                    break;
                                case '"':
                                    u = 0x0022;
                                    break;
                                case '\\':
                                    u = 0x005C;
                                    break;
                                case '0':
                                    u = 0x0000;
                                    break;
                                case 'a':
                                    u = 0x0007;
                                    break;
                                case 'b':
                                    u = 0x0008;
                                    break;
                                case 'f':
                                    u = 0x000C;
                                    break;
                                case 'n':
                                    u = 0x000A;
                                    break;
                                case 'r':
                                    u = 0x000D;
                                    break;
                                case 't':
                                    u = 0x0009;
                                    break;
                                case 'v':
                                    u = 0x000B;
                                    break;
                            }
                            // Consume the single-character escape selector.
                            ChNext();
                            return true;
                        }
LBad:
                        // Unrecognized escape: the error spans the backslash; the offending
                        // character is left unconsumed for the caller.
                        ReportError(ichErr, _cursor.IchCur, ErrId.BadEscape);
                        u = 0;
                        return false;
                }

LHex:
                bool fRet = true;
                // Skip the 'u', 'U' or 'x' selector.
                ChNext();

                u = 0;
                for (int ich = 0; ich < cchHex; ich++)
                {
                    if (!LexCharUtils.IsHexDigit(ChCur))
                    {
                        // Ran out of hex digits early. This counts as success only if at least one
                        // digit was read. An error is queued when there were no digits at all, and
                        // also for the fixed-length \u and \U forms (which then still return true
                        // with the partial value if any digit was read).
                        fRet = (ich > 0);
                        if (fUni || !fRet)
                            ReportError(ichErr, _cursor.IchCur, ErrId.BadEscape);
                        break;
                    }
                    u = (u << 4) + (uint)LexCharUtils.GetHexVal(ChCur);
                    ChNext();
                }
                return fRet;
            }
 
            /// <summary>
            /// Splits a value above 0xFFFF into a UTF-16 surrogate pair. Values above 0x10FFFF
            /// cannot be represented: a BadEscape error is queued, both outputs are set to
            /// '\0' and false is returned.
            /// </summary>
            private bool ConvertToSurrogatePair(uint u, out char ch1, out char ch2)
            {
                Contracts.Assert(u > 0x0000FFFF);

                if (u <= 0x0010FFFF)
                {
                    // The top ten bits of the offset go to the high surrogate, the low ten to the low one.
                    uint uOffset = u - 0x10000;
                    ch1 = (char)(0xD800 + (uOffset >> 10));
                    ch2 = (char)(0xDC00 + (uOffset & 0x3FF));
                    return true;
                }

                ReportError(ErrId.BadEscape);
                ch1 = '\0';
                ch2 = '\0';
                return false;
            }
 
            /// <summary>
            /// Lexes an identifier, optionally prefixed with '@'. Returns a key word token when
            /// the name is a key word and the identifier is not verbatim, an IdentToken
            /// otherwise, or null when no identifier characters could be read.
            /// </summary>
            private Token LexIdent()
            {
                // A leading '@' makes the identifier verbatim, which disables key word lookup.
                bool fVerbatim = ChCur == '@';
                if (fVerbatim)
                    ChNext();

                NormStr nstr = LexIdentCore(ref fVerbatim);
                if (nstr == null)
                {
                    // Error already reported.
                    return null;
                }

                // LexIdentCore may have switched fVerbatim on (it does so for escapes).
                if (!fVerbatim)
                {
                    KeyWordTable.KeyWordKind kwk;
                    bool fKeyWord = _lex._kwt.IsKeyWord(nstr, out kwk);
                    if (fKeyWord)
                        return KeyToken.CreateKeyWord(GetSpan(), nstr.Value.ToString(), kwk.Kind, kwk.IsContextKeyWord);
                }

                return new IdentToken(GetSpan(), nstr.Value.ToString());
            }
 
            /// <summary>
            /// Reads identifier characters into _sb, accepting \u and \U escapes, and returns the
            /// pooled name, or null when nothing was collected. Sets
            /// <paramref name="fVerbatim"/> to true when an escape is used. Characters for which
            /// IsFormat is true are consumed but not stored.
            /// </summary>
            private NormStr LexIdentCore(ref bool fVerbatim)
            {
                Contracts.Assert(LexCharUtils.IsIdentStart(ChCur));

                _sb.Length = 0;
                bool fDone = false;
                while (!fDone)
                {
                    char chIdent = '\0';
                    if (ChCur != '\\')
                    {
                        // Plain character: consume it only if it can be part of an identifier.
                        chIdent = ChCur;
                        if (LexCharUtils.IsIdent(chIdent))
                            ChNext();
                        else
                            fDone = true;
                    }
                    else
                    {
                        int ichEsc = _cursor.IchCur;
                        uint uEsc;
                        if (!FLexEscChar(true, out uEsc))
                        {
                            // The escape lexer has already queued an error.
                            fDone = true;
                        }
                        else if (uEsc <= 0xFFFF && LexCharUtils.IsIdent((char)uEsc))
                        {
                            chIdent = (char)uEsc;
                            fVerbatim = true;
                        }
                        else
                        {
                            // The escape is well formed but does not name an identifier character.
                            ReportError(ichEsc, _cursor.IchCur, ErrId.BadChar, LexCharUtils.GetUniEscape(uEsc));
                            fDone = true;
                        }
                    }

                    if (!fDone)
                    {
                        Contracts.Assert(LexCharUtils.IsIdent(chIdent));
                        if (!LexCharUtils.IsFormat(chIdent))
                            _sb.Append(chIdent);
                    }
                }

                if (_sb.Length == 0)
                    return null;

                return _lex._pool.Add(_sb);
            }
 
            /// <summary>
            /// Lexes a token that starts with '/': a single line comment ("//"), a block comment
            /// ("/* ... */"), or otherwise a punctuator. Line terminators inside a block comment
            /// are counted and queued as nested new line tokens; a block comment that runs to
            /// the end of input queues an UnterminatedComment error. A block comment whose full
            /// text is a key word yields a key word token instead of a comment token.
            /// </summary>
            private Token LexComment()
            {
                Contracts.Assert(ChCur == '/');
                int ichStart = _cursor.IchCur;
                char chSecond = ChPeek(1);

                if (chSecond == '/')
                {
                    // Single line comment: runs up to, but not including, the line terminator.
                    ChNext();
                    _sb.Length = 0;
                    _sb.Append("//");
                    while (!LexCharUtils.IsLineTerm(ChNext()) && !Eof)
                        _sb.Append(ChCur);
                    return new CommentToken(GetSpan(), _sb.ToString(), 0);
                }

                if (chSecond != '*')
                    return LexPunc();

                // Block comment: step onto the '*', then past it.
                ChNext();
                _sb.Length = 0;
                _sb.Append("/*");
                ChNext();

                int clines = 0;
                for (; ; )
                {
                    if (Eof)
                    {
                        ReportError(ichStart, _cursor.IchCur, ErrId.UnterminatedComment);
                        break;
                    }

                    char ch = ChCur;
                    if (!LexCharUtils.IsLineTerm(ch))
                        ChNext();
                    else
                    {
                        ch = LexLineTerm(_sb);
                        clines++;
                    }
                    _sb.Append(ch);

                    // A '*' just consumed, followed by '/', closes the comment.
                    if (ch == '*' && ChCur == '/')
                    {
                        _sb.Append('/');
                        ChNext();
                        break;
                    }
                }

                // We support comment keywords.
                NormStr nstr = _lex._pool.Add(_sb);
                KeyWordTable.KeyWordKind kind;
                if (!_lex._kwt.IsKeyWord(nstr, out kind))
                    return new CommentToken(GetSpan(), _sb.ToString(), clines);
                return KeyToken.CreateKeyWord(GetSpan(), nstr.ToString(), kind.Kind, kind.IsContextKeyWord);
            }
 
            /// <summary>
            /// Skips a run of spacing characters. No token is produced for spacing, so this
            /// always returns null.
            /// </summary>
            private Token LexSpace()
            {
                Contracts.Assert(LexCharUtils.StartKind(ChCur) == LexStartKind.Space);

                char ch;
                do
                {
                    ch = ChNext();
                } while (LexCharUtils.IsSpace(ch));

                return null;
            }
 
            /// <summary>
            /// Consumes one line terminator, treating CR LF as a single terminator, and queues a
            /// NewLineToken for it. The token is marked nested when the terminator is not at the
            /// start of the current token. For CR LF the CR is "dropped": it is appended to
            /// <paramref name="sb"/> when sb is not null. The returned character (the last
            /// character of the terminator) is NOT appended to sb.
            /// </summary>
            private char LexLineTerm(StringBuilder sb = null)
            {
                Contracts.Assert(LexCharUtils.StartKind(ChCur) == LexStartKind.LineTerm);

                int ichStart = _cursor.IchCur;
                bool fCrLf = ChCur == '\xD' && ChPeek(1) == '\xA';
                if (fCrLf)
                {
                    if (sb != null)
                        sb.Append(ChCur);
                    ChNext();
                }

                char chTerm = ChCur;
                ChNext();

                // The terminator is nested when some other token started before it.
                bool fNested = _ichMinTok != ichStart;
                TextSpan span = fNested ? GetTextSpan(ichStart, _cursor.IchCur) : GetSpan();
                _queue.Enqueue(new NewLineToken(span, fNested));

                _fLineStart = true;
                return chTerm;
            }
 
            /// <summary>
            /// Pre-processing is not currently supported, so a pre-processor start character is
            /// lexed as an error.
            /// </summary>
            private Token LexPreProc()
            {
                Token tokErr = LexError();
                return tokErr;
            }
 
            /// <summary>
            /// Skips the current character, plus any following characters whose start kind is
            /// None, and returns one BadChar error token covering them. The error argument lists
            /// each skipped character followed by its escape form in parentheses.
            /// </summary>
            private Token LexError()
            {
                _sb.Length = 0;
                for (; ; )
                {
                    char chBad = ChCur;
                    _sb.AppendFormat("{0}({1})", chBad, LexCharUtils.GetUniEscape(chBad));

                    // Stop at the first character that can start a token, or at the end of input.
                    if (LexCharUtils.StartKind(ChNext()) != LexStartKind.None || Eof)
                        break;
                }
                return new ErrorToken(GetSpan(), ErrId.BadChar, _sb.ToString());
            }
        }
    }
}