RegexCharClass.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
// LICENSING NOTE: The license for this file is from the originating 
// source and not the general https://github.com/dotnet/roslyn license.
// See https://github.com/dotnet/runtime/blob/5b5bd46c03c86f8545f2c4c8628ac25d875210fe/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
 
using System;
using System.Collections.Generic;
using System.Globalization;
using Microsoft.CodeAnalysis.EmbeddedLanguages.VirtualChars;
 
namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions;
 
using static FeaturesResources;
 
/// <summary>
/// Minimal copy of https://github.com/dotnet/corefx/blob/main/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
/// Used to accurately determine if something is a WordChar according to the .NET regex engine.
/// </summary>
internal static class RegexCharClass
{
    public static readonly Dictionary<string, (string shortDescription, string longDescription)> EscapeCategories =
        new()
        {
            // Others
            { "Cc", (Regex_other_control, "") },
            { "Cf", (Regex_other_format, "") },
            { "Cn", (Regex_other_not_assigned, "") },
            { "Co", (Regex_other_private_use, "") },
            { "Cs", (Regex_other_surrogate, "") },
            { "C", (Regex_all_control_characters_short, Regex_all_control_characters_long) },
            // Letters
            { "Ll", (Regex_letter_lowercase, "") },
            { "Lm", (Regex_letter_modifier, "") },
            { "Lo", (Regex_letter_other, "") },
            { "Lt", (Regex_letter_titlecase, "") },
            { "Lu", (Regex_letter_uppercase, "") },
            { "L", (Regex_all_letter_characters_short, Regex_all_letter_characters_long) },
            // Marks
            { "Mc", (Regex_mark_spacing_combining, "") },
            { "Me", (Regex_mark_enclosing, "") },
            { "Mn", (Regex_mark_nonspacing, "") },
            { "M", (Regex_all_diacritic_marks_short, Regex_all_diacritic_marks_long) },
            // Numbers
            { "Nd", (Regex_number_decimal_digit, "") },
            { "Nl", (Regex_number_letter, "") },
            { "No", (Regex_number_other, "") },
            { "N", (Regex_all_numbers_short, Regex_all_numbers_long) },
            // Punctuation
            { "Pc", (Regex_punctuation_connector, "") },
            { "Pd", (Regex_punctuation_dash, "") },
            { "Pe", (Regex_punctuation_close, "") },
            { "Po", (Regex_punctuation_other, "") },
            { "Ps", (Regex_punctuation_open, "") },
            { "Pf", (Regex_punctuation_final_quote, "") },
            { "Pi", (Regex_punctuation_initial_quote, "") },
            { "P", (Regex_all_punctuation_characters_short, Regex_all_punctuation_characters_long) },
            // Symbols
            { "Sc", (Regex_symbol_currency, "") },
            { "Sk", (Regex_symbol_modifier, "") },
            { "Sm", (Regex_symbol_math, "") },
            { "So", (Regex_symbol_other, "") },
            { "S", (Regex_all_symbols_short, Regex_all_symbols_long) },
            // Separators
            { "Zl", (Regex_separator_line, "") },
            { "Zp", (Regex_separator_paragraph, "") },
            { "Zs", (Regex_separator_space, "") },
            { "Z", (Regex_all_separator_characters_short, Regex_all_separator_characters_long) },
 
            { "IsAlphabeticPresentationForms", ("", "") },
            { "IsArabic", ("", "") },
            { "IsArabicPresentationForms-A", ("", "") },
            { "IsArabicPresentationForms-B", ("", "") },
            { "IsArmenian", ("", "") },
            { "IsArrows", ("", "") },
            { "IsBasicLatin", ("", "") },
            { "IsBengali", ("", "") },
            { "IsBlockElements", ("", "") },
            { "IsBopomofo", ("", "") },
            { "IsBopomofoExtended", ("", "") },
            { "IsBoxDrawing", ("", "") },
            { "IsBraillePatterns", ("", "") },
            { "IsBuhid", ("", "") },
            { "IsCJKCompatibility", ("", "") },
            { "IsCJKCompatibilityForms", ("", "") },
            { "IsCJKCompatibilityIdeographs", ("", "") },
            { "IsCJKRadicalsSupplement", ("", "") },
            { "IsCJKSymbolsandPunctuation", ("", "") },
            { "IsCJKUnifiedIdeographs", ("", "") },
            { "IsCJKUnifiedIdeographsExtensionA", ("", "") },
            { "IsCherokee", ("", "") },
            { "IsCombiningDiacriticalMarks", ("", "") },
            { "IsCombiningDiacriticalMarksforSymbols", ("", "") },
            { "IsCombiningHalfMarks", ("", "") },
            { "IsCombiningMarksforSymbols", ("", "") },
            { "IsControlPictures", ("", "") },
            { "IsCurrencySymbols", ("", "") },
            { "IsCyrillic", ("", "") },
            { "IsCyrillicSupplement", ("", "") },
            { "IsDevanagari", ("", "") },
            { "IsDingbats", ("", "") },
            { "IsEnclosedAlphanumerics", ("", "") },
            { "IsEnclosedCJKLettersandMonths", ("", "") },
            { "IsEthiopic", ("", "") },
            { "IsGeneralPunctuation", ("", "") },
            { "IsGeometricShapes", ("", "") },
            { "IsGeorgian", ("", "") },
            { "IsGreek", ("", "") },
            { "IsGreekExtended", ("", "") },
            { "IsGreekandCoptic", ("", "") },
            { "IsGujarati", ("", "") },
            { "IsGurmukhi", ("", "") },
            { "IsHalfwidthandFullwidthForms", ("", "") },
            { "IsHangulCompatibilityJamo", ("", "") },
            { "IsHangulJamo", ("", "") },
            { "IsHangulSyllables", ("", "") },
            { "IsHanunoo", ("", "") },
            { "IsHebrew", ("", "") },
            { "IsHighPrivateUseSurrogates", ("", "") },
            { "IsHighSurrogates", ("", "") },
            { "IsHiragana", ("", "") },
            { "IsIPAExtensions", ("", "") },
            { "IsIdeographicDescriptionCharacters", ("", "") },
            { "IsKanbun", ("", "") },
            { "IsKangxiRadicals", ("", "") },
            { "IsKannada", ("", "") },
            { "IsKatakana", ("", "") },
            { "IsKatakanaPhoneticExtensions", ("", "") },
            { "IsKhmer", ("", "") },
            { "IsKhmerSymbols", ("", "") },
            { "IsLao", ("", "") },
            { "IsLatin-1Supplement", ("", "") },
            { "IsLatinExtended-A", ("", "") },
            { "IsLatinExtended-B", ("", "") },
            { "IsLatinExtendedAdditional", ("", "") },
            { "IsLetterlikeSymbols", ("", "") },
            { "IsLimbu", ("", "") },
            { "IsLowSurrogates", ("", "") },
            { "IsMalayalam", ("", "") },
            { "IsMathematicalOperators", ("", "") },
            { "IsMiscellaneousMathematicalSymbols-A", ("", "") },
            { "IsMiscellaneousMathematicalSymbols-B", ("", "") },
            { "IsMiscellaneousSymbols", ("", "") },
            { "IsMiscellaneousSymbolsandArrows", ("", "") },
            { "IsMiscellaneousTechnical", ("", "") },
            { "IsMongolian", ("", "") },
            { "IsMyanmar", ("", "") },
            { "IsNumberForms", ("", "") },
            { "IsOgham", ("", "") },
            { "IsOpticalCharacterRecognition", ("", "") },
            { "IsOriya", ("", "") },
            { "IsPhoneticExtensions", ("", "") },
            { "IsPrivateUse", ("", "") },
            { "IsPrivateUseArea", ("", "") },
            { "IsRunic", ("", "") },
            { "IsSinhala", ("", "") },
            { "IsSmallFormVariants", ("", "") },
            { "IsSpacingModifierLetters", ("", "") },
            { "IsSpecials", ("", "") },
            { "IsSuperscriptsandSubscripts", ("", "") },
            { "IsSupplementalArrows-A", ("", "") },
            { "IsSupplementalArrows-B", ("", "") },
            { "IsSupplementalMathematicalOperators", ("", "") },
            { "IsSyriac", ("", "") },
            { "IsTagalog", ("", "") },
            { "IsTagbanwa", ("", "") },
            { "IsTaiLe", ("", "") },
            { "IsTamil", ("", "") },
            { "IsTelugu", ("", "") },
            { "IsThaana", ("", "") },
            { "IsThai", ("", "") },
            { "IsTibetan", ("", "") },
            { "IsUnifiedCanadianAboriginalSyllabics", ("", "") },
            { "IsVariationSelectors", ("", "") },
            { "IsYiRadicals", ("", "") },
            { "IsYiSyllables", ("", "") },
            { "IsYijingHexagramSymbols", ("", "") },
            { "_xmlC", ("", "") },
            { "_xmlD", ("", "") },
            { "_xmlI", ("", "") },
            { "_xmlW", ("", "") },
        };
 
    public static bool IsEscapeCategory(string value)
        => EscapeCategories.ContainsKey(value);
 
    public static bool IsBoundaryWordChar(VirtualChar r)
    {
        // unicode characters that do not fit in 16bits are not supported by 
        // .net regex system.
        if (r.Value > char.MaxValue)
            return false;
 
        var ch = (char)r.Value;
 
        // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
        // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
        // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
        // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
 
        // Fast lookup in our lookup table for ASCII characters.  This is purely an optimization, and has the
        // behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
        ReadOnlySpan<byte> asciiLookup = [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07];
        var chDiv8 = ch >> 3;
        if ((uint)chDiv8 < (uint)asciiLookup.Length)
        {
            return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
        }
 
        // For non-ASCII, fall back to checking the Unicode category.
        switch (CharUnicodeInfo.GetUnicodeCategory(ch))
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
            case UnicodeCategory.ModifierLetter:
            case UnicodeCategory.OtherLetter:
            case UnicodeCategory.NonSpacingMark:
            case UnicodeCategory.DecimalDigitNumber:
            case UnicodeCategory.ConnectorPunctuation:
                return true;
 
            default:
                const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
                return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
        }
    }
}
File: EmbeddedLanguages\RegularExpressions\RegexCharClass.cs	Web Access
Project: src\src\Features\Core\Portable\Microsoft.CodeAnalysis.Features.csproj (Microsoft.CodeAnalysis.Features)