File: src\libraries\System.Private.CoreLib\src\System\Text\Unicode\Utf8Utility.cs
Web Access
Project: src\src\coreclr\System.Private.CoreLib\System.Private.CoreLib.csproj (System.Private.CoreLib)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if NET
using System.Runtime.Intrinsics;
#endif
 
namespace System.Text.Unicode
{
    internal static partial class Utf8Utility
    {
        /// <summary>
        /// The maximum number of bytes that can result from UTF-8 transcoding
        /// any Unicode scalar value.
        /// </summary>
        internal const int MaxBytesPerScalar = 4;
 
        /// <summary>
        /// Returns the byte index in <paramref name="utf8Data"/> where the first invalid UTF-8 sequence begins,
        /// or -1 if the buffer contains no invalid sequences. Also outs the <paramref name="isAscii"/> parameter
        /// stating whether all data observed (up to the first invalid sequence or the end of the buffer, whichever
        /// comes first) is ASCII.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static unsafe int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
        {
            fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data))
            {
                byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _);
                int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte);
 
                isAscii = (utf16CodeUnitCountAdjustment == 0); // If UTF-16 char count == UTF-8 byte count, it's ASCII.
                return (index < utf8Data.Length) ? index : -1;
            }
        }
 
        /// <summary>
        /// Returns true iff the UInt32 represents four ASCII UTF-8 characters in machine endianness.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool AllBytesInUInt32AreAscii(uint value) => (value & ~0x7F7F_7F7Fu) == 0;
 
        /// <summary>
        /// Returns true iff the UInt64 represents eighty ASCII UTF-8 characters in machine endianness.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool AllBytesInUInt64AreAscii(ulong value) => (value & ~0x7F7F_7F7F_7F7F_7F7Ful) == 0;
 
        /// <summary>
        /// Given a UInt32 that represents four ASCII UTF-8 characters, returns the invariant
        /// lowercase representation of those characters. Requires the input value to contain
        /// four ASCII UTF-8 characters in machine endianness.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static uint ConvertAllAsciiBytesInUInt32ToLowercase(uint value)
        {
            // ASSUMPTION: Caller has validated that input value is ASCII.
            Debug.Assert(AllBytesInUInt32AreAscii(value));
 
            // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'A'
            uint lowerIndicator = value + 0x8080_8080u - 0x4141_4141u;
 
            // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'Z'
            uint upperIndicator = value + 0x8080_8080u - 0x5B5B_5B5Bu;
 
            // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
            uint combinedIndicator = (lowerIndicator ^ upperIndicator);
 
            // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'A' and <= 'Z'
            uint mask = (combinedIndicator & 0x8080_8080u) >> 2;
 
            return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z]
        }
 
        /// <summary>
        /// Given a UInt32 that represents four ASCII UTF-8 characters, returns the invariant
        /// uppercase representation of those characters. Requires the input value to contain
        /// four ASCII UTF-8 characters in machine endianness.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static uint ConvertAllAsciiBytesInUInt32ToUppercase(uint value)
        {
            // Intrinsified in mono interpreter
            // ASSUMPTION: Caller has validated that input value is ASCII.
            Debug.Assert(AllBytesInUInt32AreAscii(value));
 
            // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'a'
            uint lowerIndicator = value + 0x8080_8080u - 0x6161_6161u;
 
            // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'z'
            uint upperIndicator = value + 0x8080_8080u - 0x7B7B_7B7Bu;
 
            // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
            uint combinedIndicator = (lowerIndicator ^ upperIndicator);
 
            // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'a' and <= 'z'
            uint mask = (combinedIndicator & 0x8080_8080u) >> 2;
 
            return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z]
        }
 
        /// <summary>
        /// Given a UInt64 that represents eight ASCII UTF-8 characters, returns the invariant
        /// uppercase representation of those characters. Requires the input value to contain
        /// eight ASCII UTF-8 characters in machine endianness.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static ulong ConvertAllAsciiBytesInUInt64ToUppercase(ulong value)
        {
            // ASSUMPTION: Caller has validated that input value is ASCII.
            Debug.Assert(AllBytesInUInt64AreAscii(value));
 
            // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'a'
            ulong lowerIndicator = value + 0x8080_8080_8080_8080ul - 0x6161_6161_6161_6161ul;
 
            // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'z'
            ulong upperIndicator = value + 0x8080_8080_8080_8080ul - 0x7B7B_7B7B_7B7B_7B7Bul;
 
            // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
            ulong combinedIndicator = (lowerIndicator ^ upperIndicator);
 
            // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'a' and <= 'z'
            ulong mask = (combinedIndicator & 0x8080_8080_8080_8080ul) >> 2;
 
            return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z]
        }
 
        /// <summary>
        /// Given a UInt64 that represents eight ASCII UTF-8 characters, returns the invariant
        /// uppercase representation of those characters. Requires the input value to contain
        /// eight ASCII UTF-8 characters in machine endianness.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static ulong ConvertAllAsciiBytesInUInt64ToLowercase(ulong value)
        {
            // ASSUMPTION: Caller has validated that input value is ASCII.
            Debug.Assert(AllBytesInUInt64AreAscii(value));
 
            // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'A'
            ulong lowerIndicator = value + 0x8080_8080_8080_8080ul - 0x4141_4141_4141_4141ul;
 
            // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'Z'
            ulong upperIndicator = value + 0x8080_8080_8080_8080ul - 0x5B5B_5B5B_5B5B_5B5Bul;
 
            // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
            ulong combinedIndicator = (lowerIndicator ^ upperIndicator);
 
            // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'a' and <= 'z'
            ulong mask = (combinedIndicator & 0x8080_8080_8080_8080ul) >> 2;
 
            return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z]
        }
 
        /// <summary>
        /// Given two UInt32s that represent four ASCII UTF-8 characters each, returns true iff
        /// the two inputs are equal using an ordinal case-insensitive comparison.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
        {
            // Not currently intrinsified in mono interpreter, the UTF16 version is
            // ASSUMPTION: Caller has validated that input values are ASCII.
            Debug.Assert(AllBytesInUInt32AreAscii(valueA));
            Debug.Assert(AllBytesInUInt32AreAscii(valueB));
 
            // The logic here is very simple and is doing SIMD Within A Register (SWAR)
            //
            // First we want to create a mask finding the upper-case ASCII characters
            //
            // To do that, we can take the above presumption that all characters are ASCII
            // and therefore between 0x00 and 0x7F, inclusive. This means that `0x80 + char`
            // will never overflow and will at most produce 0xFF.
            //
            // Given that, we can check if a byte is greater than a value by adding it to
            // 0x80 and then subtracting the constant we're comparing against. So, for example,
            // if we want to find all characters greater than 'A' we do `value + 0x80 - 'A'`.
            //
            // Given that 'A' is 0x41, we end up with `0x41 + 0x80 == 0xC1` then we subtract 'A'
            // giving us `0xC1 - 0x41 == 0x80` and up to `0xBE` for 'DEL' (0x7F). This means that
            // any character greater than or equal to 'A' will have the most significant bit set.
            //
            // This can itself be simplified down to `val + (0x80 - 'A')` or `val + 0x3F`
            //
            // We also want to find the characters less than or equal to 'Z' as well. This follows
            // the same general principle but relies on finding the inverse instead. That is, we
            // want to find all characters greater than or equal to ('Z' + 1) and then inverse it.
            //
            // To confirm this, lets look at 'Z' which has the value of '0x5A'. So we first do
            // `0x5A + 0x80 == 0xDA`, then we subtract `[' (0x5B) giving us `0xDA - 0x5B == 0x80`.
            // This means that any character greater than 'Z' will now have the most significant bit set.
            //
            // It then follows that taking the ones complement will give us a mask representing the bytes
            // which are less than or equal to 'Z' since `!(val >= 0x5B) == (val <= 0x5A)`
            //
            // This then gives us that `('A' <= val) && (val <= 'Z')` is representable as
            // `(val + 0x3F) & ~(val + 0x25)`
            //
            // However, since a `val` cannot be simultaneously less than 'A' and greater than 'Z' we
            // are able to simplify this further to being just `(val + 0x3F) ^ (val + 0x25)`
            //
            // We then want to mask off the excess bits that aren't important to the mask and right
            // shift by two. This gives us `0x20` for a byte which is an upper-case ASCII character
            // and `0x00` otherwise.
            //
            // We now have a super efficient implementation that does a correct comparison in
            // 12 instructions and with zero branching.
 
            uint letterMaskA = (((valueA + 0x3F3F3F3F) ^ (valueA + 0x25252525)) & 0x80808080) >> 2;
            uint letterMaskB = (((valueB + 0x3F3F3F3F) ^ (valueB + 0x25252525)) & 0x80808080) >> 2;
 
            return (valueA | letterMaskA) == (valueB | letterMaskB);
        }
 
        /// <summary>
        /// Given two UInt64s that represent eight ASCII UTF-8 characters each, returns true iff
        /// the two inputs are equal using an ordinal case-insensitive comparison.
        /// </summary>
        /// <remarks>
        /// This is a branchless implementation.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
        {
            // Not currently intrinsified in mono interpreter, the UTF16 version is
            // ASSUMPTION: Caller has validated that input values are ASCII.
            Debug.Assert(AllBytesInUInt64AreAscii(valueA));
            Debug.Assert(AllBytesInUInt64AreAscii(valueB));
 
            // Duplicate of logic in UInt32OrdinalIgnoreCaseAscii, but using 64-bit consts.
            // See comments in that method for more info.
 
            ulong letterMaskA = (((valueA + 0x3F3F3F3F3F3F3F3F) ^ (valueA + 0x2525252525252525)) & 0x8080808080808080) >> 2;
            ulong letterMaskB = (((valueB + 0x3F3F3F3F3F3F3F3F) ^ (valueB + 0x2525252525252525)) & 0x8080808080808080) >> 2;
 
            return (valueA | letterMaskA) == (valueB | letterMaskB);
        }
 
#if NET
        /// <summary>
        /// Returns true iff the Vector128 represents 16 ASCII UTF-8 characters in machine endianness.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool AllBytesInVector128AreAscii(Vector128<byte> vec)
        {
            return (vec & Vector128.Create(unchecked((byte)(~0x7F)))) == Vector128<byte>.Zero;
        }
 
        /// <summary>
        /// Given two Vector128 that represent 16 ASCII UTF-8 characters each, returns true iff
        /// the two inputs are equal using an ordinal case-insensitive comparison.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool Vector128OrdinalIgnoreCaseAscii(Vector128<byte> vec1, Vector128<byte> vec2)
        {
            // ASSUMPTION: Caller has validated that input values are ASCII.
 
            // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
            Vector128<sbyte> lowIndicator1 = Vector128.Create((sbyte)(0x80 - 'A')) + vec1.AsSByte();
            Vector128<sbyte> lowIndicator2 = Vector128.Create((sbyte)(0x80 - 'A')) + vec2.AsSByte();
 
            // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
            Vector128<sbyte> combIndicator1 =
                Vector128.LessThan(Vector128.Create(unchecked((sbyte)(('Z' - 'A') - 0x80))), lowIndicator1);
            Vector128<sbyte> combIndicator2 =
                Vector128.LessThan(Vector128.Create(unchecked((sbyte)(('Z' - 'A') - 0x80))), lowIndicator2);
 
            // Convert both vectors to lower case by adding 0x20 bit for all [A-Z][a-z] characters
            Vector128<sbyte> lcVec1 =
                Vector128.AndNot(Vector128.Create((sbyte)0x20), combIndicator1) + vec1.AsSByte();
            Vector128<sbyte> lcVec2 =
                Vector128.AndNot(Vector128.Create((sbyte)0x20), combIndicator2) + vec2.AsSByte();
 
            // Compare two lowercased vectors
            return (lcVec1 ^ lcVec2) == Vector128<sbyte>.Zero;
        }
#endif
    }
}