WordParser.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Immutable;
using System.Collections.ObjectModel;
using System.Text;
 
namespace Analyzer.Utilities
{
    /// <summary>
    ///     Provides <see langword="static"/> methods for parsing words from text.
    /// </summary>
    internal sealed class WordParser
    {
        // WordParser has two distinct modes; one where it breaks up only words in
        // a given piece of text, and the other where it breaks up both words
        // and individual compounds within words in a piece of text. Passing
        // WordParserOptions.None to the constructor (or Parse) causes it to enter
        // the former, and WordParserOptions.SplitCompoundWords the later.
        //
        // If you simply want to iterate over the words, you can avoid the
        // allocation of a Collection<String> if you manually construct WordParser
        // and use the NextWord method instead of using the static Parse method.
        //
        // [char]:      Represents any Unicode character
        // [A-Z]:       Represents any Unicode uppercase letter
        // [a-z]:       Represents any Unicode lowercase letter
        // [0-9]:       Represents the numbers 0 to 9
        // [letter]:    Represents any Unicode letter
        //
        // <words>      -> <prefix>(<word> | <notword>)+
        //
        // <notword>    -> !<word>
        //
        // <prefix>     -> [char]
        //
        // WordParserOptions.None:
        // <word>       -> ([0-9] | [letter])+
        //
        // WordParserOptions.SplitCompoundWords:
        // <word>       -> <numeric> | <uppercase> | <lowercase> | <nocase> | <csshex>
        // <numeric>    -> (<integer> | <hex>)
        // <integer>    -> [0-9]+
        // <hex>        -> [0x]([0-9] | [A-F] | [a-f])+
        // <uppercase>  -> ([A-Z](<lowercase> | <allcaps>)) | <allcaps>
        // <lowercase>  -> [a-z]+
        // <nocase>     -> ([letter] ![A-Z] ![a-z])+
        // <allcaps>    -> [A-Z]+(s) (unless next character is [a-z])
        // <csshex>     -> [#]([0-9] | [A-F] | [a-f])+
 
        private const char NullChar = '\0';
        private readonly WordParserOptions _options;
        private readonly StringBuilder _buffer;
        private readonly string _text;
        private string? _peekedWord;
        private int _index;
        private char _prefix;
 
        /// <summary>
        ///     Initializes a new instance of the <see cref="WordParser"/> class with the specified text and options.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to parse.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        public WordParser(string text, WordParserOptions options) : this(text, options, NullChar)
        {
        }
 
        /// <summary>
        ///     Initializes a new instance of the <see cref="WordParser"/> class with the specified text, options and prefix.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to parse.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <param name="prefix">
        ///     A <see cref="char"/> representing an optional prefix of <paramref name="text"/>, that if present,
        ///     will be returned as a separate token.
        /// </param>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        public WordParser(string text, WordParserOptions options, char prefix)
        {
            if (options is < WordParserOptions.None or > (WordParserOptions.IgnoreMnemonicsIndicators | WordParserOptions.SplitCompoundWords))
            {
                throw new ArgumentException($"'{(int)options}' is invalid for enum type '{nameof(WordParserOptions)}'", nameof(options));
            }
 
            _text = text ?? throw new ArgumentNullException(nameof(text));
            _options = options;
            _buffer = new StringBuilder(text.Length);
            _prefix = prefix;
        }
 
        private bool SkipMnemonics =>
            (_options & WordParserOptions.IgnoreMnemonicsIndicators) == WordParserOptions.IgnoreMnemonicsIndicators;
 
        private bool SplitCompoundWords =>
            (_options & WordParserOptions.SplitCompoundWords) == WordParserOptions.SplitCompoundWords;
 
        /// <summary>
        ///     Returns the words contained in the specified text, delimiting based on the specified options.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to parse.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <returns>
        ///     A <see cref="Collection{T}"/> of strings containing the words contained in <paramref name="text"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        internal static Collection<string> Parse(string text, WordParserOptions options)
        {
            return Parse(text, options, NullChar);
        }
 
        /// <summary>
        ///     Returns the words contained in the specified text, delimiting based on the specified options.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to parse.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <param name="prefix">
        ///     A <see cref="char"/> representing an optional prefix of <paramref name="text"/>, that if present,
        ///     will be returned as a separate token.
        /// </param>
        /// <returns>
        ///     A <see cref="Collection{T}"/> of strings containing the words contained in <paramref name="text"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        internal static Collection<string> Parse(string text, WordParserOptions options, char prefix)
        {
            WordParser parser = new WordParser(text, options, prefix);
            Collection<string> words = [];
 
            string? word;
            while ((word = parser.NextWord()) != null)
            {
                words.Add(word);
            }
 
            return words;
        }
 
        /// <summary>
        ///     Returns a value indicating whether at least one of the specified words occurs, using a case-insensitive ordinal comparison, within the specified text.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to check.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <param name="words">
        ///     A <see cref="string"/> array containing the words to seek.
        /// </param>
        /// <returns>
        ///     <see langword="true"/> if at least one of the elements within <paramref name="words"/> occurs within <paramref name="text"/>, otherwise, <see langword="false"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        ///     <para>
        ///      -or-
        ///     </para>
        ///     <paramref name="words"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        public static bool ContainsWord(string text, WordParserOptions options, ImmutableArray<string> words)
        {
            return ContainsWord(text, options, NullChar, words);
        }
 
        /// <summary>
        ///     Returns a value indicating whether at least one of the specified words occurs, using a case-insensitive ordinal comparison, within the specified text.
        /// </summary>
        /// <param name="text">
        ///     A <see cref="string"/> containing the text to check.
        /// </param>
        /// <param name="options">
        ///     One or more of the <see cref="WordParserOptions"/> specifying parsing and delimiting options.
        /// </param>
        /// <param name="prefix">
        ///     A <see cref="char"/> representing an optional prefix of <paramref name="text"/>, that if present,
        ///     will be returned as a separate token.
        /// </param>
        /// <param name="words">
        ///     A <see cref="string"/> array containing the words to seek.
        /// </param>
        /// <returns>
        ///     <see langword="true"/> if at least one of the elements within <paramref name="words"/> occurs within <paramref name="text"/>, otherwise, <see langword="false"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="text"/> is <see langword="null"/>.
        ///     <para>
        ///      -or-
        ///     </para>
        ///     <paramref name="words"/> is <see langword="null"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        ///     <paramref name="options"/> is not one or more of the <see cref="WordParserOptions"/> values.
        /// </exception>
        internal static bool ContainsWord(string text, WordParserOptions options, char prefix, ImmutableArray<string> words)
        {
            if (words.IsDefault)
            {
                throw new ArgumentNullException(nameof(words));
            }
 
            WordParser parser = new WordParser(text, options, prefix);
 
            string? parsedWord;
            while ((parsedWord = parser.NextWord()) != null)
            {
                foreach (string word in words)
                {
                    if (string.Equals(parsedWord, word, StringComparison.OrdinalIgnoreCase))
                    {
                        return true;
                    }
                }
            }
 
            return false;
        }
 
        /// <summary>
        ///     Returns the next word in the text.
        /// </summary>
        /// <returns>
        ///     A <see cref="string"/> containing the next word or <see langword="null"/> if there are no more words.
        /// </returns>
        public string? NextWord()
        {
            if (_peekedWord == null)
            {
                return NextWordCore();
            }
 
            string? word = _peekedWord;
            _peekedWord = null;
            return word;
        }
 
        /// <summary>
        ///     Returns the next word in the text without consuming it.
        /// </summary>
        /// <returns>
        ///     A <see cref="string"/> containing the next word or <see langword="null"/> if there are no more words.
        /// </returns>
        public string? PeekWord()
        {
            _peekedWord ??= NextWordCore();
 
            return _peekedWord;
        }
 
        private string? NextWordCore()
        {
            // Reset buffer
            _buffer.Length = 0;
 
            if (ParseNext())
            { // We parsed something
                return _buffer.ToString();
            }
 
            return null;
        }
 
        private bool ParseNext()
        {
            if (TryParsePrefix())
            {   // Try parse the prefix e.g. 'I' in 'IInterface'.
                return true;
            }
 
            char c;
            char punctuation = NullChar;
 
            while ((c = Peek()) != NullChar)
            {
                if (!TryParseWord(c))
                {
                    if (punctuation != NullChar)
                    { // Intra-word punctuation next to unrecognized character e.g. 'Foo-?'
                        Unread();
                        Skip();
                        return true;
                    }
 
                    // Unrecognized character, ignore
                    Skip();
                    continue;
                }
 
                c = Peek();
 
                if (IsIntraWordPunctuation(c))
                { // Intra-word punctuation e.g. '-' in 'Foo-Bar'
                    punctuation = c;
                    Read();
                    continue;
                }
 
                // We parsed something
                return true;
            }
 
            if (punctuation != NullChar)
            {   // Ends with intra-word punctuation e.g. '-' in 'Foo-'
                Unread();
                return true;
            }
 
            return false;
        }
 
        private bool TryParseWord(char c)
        {
            if (SplitCompoundWords)
            {   // Parse both whole and compound words
                if (IsUpper(c))
                {   // 'ALLCAPS' or 'PascalCased'
                    ParseUppercase();
                    return true;
                }
 
                if (IsLower(c))
                {   // 'foo'
                    ParseLowercase();
                    return true;
                }
 
                if (IsDigit(c))
                {   // '123' or '0xABCDEF'
                    ParseNumeric();
                    return true;
                }
 
                if (IsLetterWithoutCase(c))
                {   // e.g. Japanese characters
                    ParseWithoutCase();
                    return true;
                }
 
                if (c == '#' && IsHexDigit(Peek(2)))
                {   // '#ABC123'
                    ParseHex();
                    return true;
                }
            }
            else if (IsLetterOrDigit(c))
            {   // Parse only whole words
                ParseWholeWord();
                return true;
            }
 
            // Unrecognized character
            return false;
        }
 
        private bool TryParsePrefix()
        {
            if (_prefix == NullChar)
            {
                return false;
            }
 
            char c = Peek();
 
            if (c == _prefix)
            {
                c = Peek(2);
 
                if (!IsLower(c))
                {   // 'IInterface' or 'T1', but not 'Interface', or 'Type'
                    // Consume the prefix
                    Read();
 
                    // We do not want to try and read the prefix again
                    _prefix = NullChar;
                    return true;
                }
            }
 
            // We do not want to try and read the prefix again
            _prefix = NullChar;
            return false;
        }
 
        private void ParseWholeWord()
        {
            char c;
            do
            {
                Read();
                c = Peek();
            }
            while (IsLetterOrDigit(c));
        }
 
        private void ParseInteger()
        {
            char c;
            do
            {
                Read();
                c = Peek();
            }
            while (IsDigit(c));
        }
 
        private void ParseHex()
        {
            char c;
            do
            {
                Read();
                c = Peek();
            }
            while (IsHexDigit(c));
        }
 
        private void ParseNumeric()
        {
            char c = Peek();
 
            if (c == '0')
            {
                c = Peek(2);
 
                if ((c == 'x' || c == 'X') && IsHexDigit(Peek(3)))
                {   // '0xA' or '0XA'
                    Read(); // Consume '0'
                    Read(); // Consume 'x' or 'X'
 
                    ParseHex();
                    return;
                }
            }
 
            ParseInteger();
        }
 
        private void ParseLowercase()
        {
            char c;
            do
            {
                Read();
                c = Peek();
            }
            while (IsLower(c));
        }
 
        private void ParseUppercase()
        {
            Read();
 
            char c = Peek();
 
            if (IsUpper(c))
            {   // 'ALLCAPS'
                ParseAllCaps();
            }
            else if (IsLower(c))
            {   // 'PascalCased'
                ParseLowercase();
            }
        }
 
        private void ParseWithoutCase()
        {
            // Parses letters without any concept of case e.g. Japanese
 
            char c;
            do
            {
                Read();
                c = Peek();
            }
            while (IsLetterWithoutCase(c));
        }
 
        private void ParseAllCaps()
        {
            char c;
 
            // Optimistically consume all consecutive uppercase letters
            do
            {
                Read();
                c = Peek();
            }
            while (IsUpper(c));
 
            // Optimistically consume a trailing 's'
            if (c == 's')
            {
                Read();
                c = Peek();
            }
 
            // Reject the final uppercase letter (and trailing 's')
            // if they are followed by a lower case letter.
            while (IsLower(c))
            {
                Unread();
                c = Peek();
            }
        }
 
        private void Read()
        {
            char c = Peek();
            _buffer.Append(c);
            Skip();
        }
 
        private void Skip()
        {
            while (_index < _text.Length)
            {
                char c = _text[_index++];
 
                if (!IsIgnored(c))
                {
                    break;
                }
            }
        }
 
        private char Peek()
        {
            return Peek(1);
        }
 
        private char Peek(int lookAhead)
        {
            for (int index = _index; index < _text.Length; index++)
            {
                char c = _text[index];
 
                if (IsIgnored(c))
                {
                    continue;
                }
 
                if (--lookAhead == 0)
                {
                    return c;
                }
            }
 
            return NullChar;
        }
 
        private void Unread()
        {
            while (_index >= 0)
            {
                char c = _text[--_index];
 
                if (!IsIgnored(c))
                {
                    break;
                }
            }
 
            _buffer.Length--;
        }
 
        private bool IsIgnored(char c)
        {   // TODO: We should extend this to handle 'real' mnemonics,
            // instead of just blindly skipping all ampersands and
            // underscores.For example, '&&OK' should really be
            // interpreted as '&OK', instead of 'OK'.
            if (SkipMnemonics)
            {
                return c is '&' or '_';
            }
 
            return false;
        }
 
        private static bool IsLower(char c)
        {
            return char.IsLower(c);
        }
 
        private static bool IsUpper(char c)
        {
            return char.IsUpper(c);
        }
 
        private static bool IsLetterOrDigit(char c)
        {
            return char.IsLetterOrDigit(c);
        }
 
        private static bool IsLetterWithoutCase(char c)
        {
            if (char.IsLetter(c) && !char.IsUpper(c))
            {
                return !char.IsLower(c);
            }
 
            return false;
        }
 
        private static bool IsDigit(char c)
        {
            return char.IsDigit(c);
        }
 
        private static bool IsHexDigit(char c)
        {
            return c switch
            {
                'A' or 'a' or 'B' or 'b' or 'C' or 'c' or 'D' or 'd' or 'E' or 'e' or 'F' or 'f' => true,
                _ => IsDigit(c),
            };
        }
 
        private static bool IsIntraWordPunctuation(char c)
        {   // Don't be tempted to add En dash and Em dash to this
            // list, as these should be treated as word delimiters.
 
            return c switch
            {
                '-' or '\u00AD' or '\'' or '\u2019' => true,
                _ => false,
            };
        }
    }
}
File: src\RoslynAnalyzers\Utilities\Compiler\WordParser.cs	Web Access
Project: src\src\RoslynAnalyzers\Text.Analyzers\Core\Text.Analyzers.csproj (Text.Analyzers)