File: System\Text\Encodings\Web\AllowedBmpCodePointsBitmap.cs
Web Access
Project: src\src\libraries\System.Text.Encodings.Web\src\System.Text.Encodings.Web.csproj (System.Text.Encodings.Web)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers.Binary;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Unicode;
 
#if NET
using System.Numerics;
#endif
 
namespace System.Text.Encodings.Web
{
    /// <summary>
    /// A fixed-size bit set holding one bit for each of the 65,536 code points in the
    /// Basic Multilingual Plane. A set bit means the code point is on the allow list.
    /// </summary>
    internal unsafe struct AllowedBmpCodePointsBitmap
    {
        // 64k bits packed 32 to a uint.
        private const int BitmapLengthInDWords = 64 * 1024 / 32;
        private fixed uint Bitmap[BitmapLengthInDWords];

        /// <summary>
        /// Puts <paramref name="value"/> on the allow list by setting its bit.
        /// </summary>
        /// <param name="value">The character to allow.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void AllowChar(char value)
        {
            _GetIndexAndOffset(value, out nuint dwordIndex, out int bitIndex);
            uint mask = 1u << bitIndex;
            Bitmap[dwordIndex] |= mask;
        }

        /// <summary>
        /// Takes <paramref name="value"/> off the allow list by clearing its bit.
        /// </summary>
        /// <param name="value">The character to forbid.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void ForbidChar(char value)
        {
            _GetIndexAndOffset(value, out nuint dwordIndex, out int bitIndex);
            uint keepMask = ~(1u << bitIndex);
            Bitmap[dwordIndex] &= keepMask;
        }

        /// <summary>
        /// Takes every HTML-sensitive character off the allow list.
        /// </summary>
        public void ForbidHtmlCharacters()
        {
            // '<', '>' and '&' are HTML-sensitive; the single and double quote can be used
            // to escape attributes; '+' is technically not HTML-specific, but can be used
            // to perform UTF7-based attacks.
            foreach (char htmlSensitive in "<>&'\"+")
            {
                ForbidChar(htmlSensitive);
            }
        }

        /// <summary>
        /// Intersects this bitmap with the bitmap of defined BMP code points, so that any code
        /// point which isn't mapped to a defined character, or which is otherwise always
        /// disallowed, ends up off the allow list.
        /// </summary>
        /// <remarks>
        /// Always-disallowed categories include Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, and Zp.
        /// </remarks>
        public void ForbidUndefinedCharacters()
        {
            fixed (uint* pThisBitmap = Bitmap)
            {
                Span<uint> allowedDWords = new Span<uint>(pThisBitmap, BitmapLengthInDWords);
                ReadOnlySpan<byte> definedBytes = UnicodeHelpers.GetDefinedBmpCodePointsBitmapLittleEndian();

                // Both bitmaps must describe the same number of bits.
                Debug.Assert(definedBytes.Length == allowedDWords.Length * sizeof(uint));

#if NET
                if (Vector.IsHardwareAccelerated && BitConverter.IsLittleEndian)
                {
                    // The defined-characters data is stored little-endian, so its bytes are only
                    // loaded directly as uint lanes when the machine is little-endian as well.
                    // Each iteration ANDs one vector's worth of data and advances both spans.
                    while (definedBytes.Length > 0)
                    {
                        Vector<uint> definedLanes = new Vector<uint>(definedBytes);
                        Vector<uint> allowedLanes = new Vector<uint>(allowedDWords);
                        Vector<uint> stillAllowed = definedLanes & allowedLanes;
                        stillAllowed.CopyTo(allowedDWords);

                        definedBytes = definedBytes.Slice(Vector<byte>.Count);
                        allowedDWords = allowedDWords.Slice(Vector<uint>.Count);
                    }

                    Debug.Assert(allowedDWords.IsEmpty, "Both vectors should've been fully consumed.");
                    return;
                }
#endif

                // Not Core, or not little-endian, or not SIMD-optimized:
                // AND one uint at a time, decoding each source value as little-endian.
                int byteOffset = 0;
                foreach (ref uint allowedDWord in allowedDWords)
                {
                    allowedDWord &= BinaryPrimitives.ReadUInt32LittleEndian(definedBytes.Slice(byteOffset));
                    byteOffset += sizeof(uint);
                }
            }
        }

        /// <summary>
        /// Reports whether <paramref name="value"/> is currently on the allow list.
        /// </summary>
        /// <param name="value">The character to look up.</param>
        /// <returns><see langword="true"/> if the character's bit is set; otherwise <see langword="false"/>.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public readonly bool IsCharAllowed(char value)
        {
            // Any char value selects an in-range element of the bitmap, so no bounds check is needed.
            _GetIndexAndOffset(value, out nuint dwordIndex, out int bitIndex);
            return (Bitmap[dwordIndex] & (1u << bitIndex)) != 0;
        }

        /// <summary>
        /// Reports whether the code point <paramref name="value"/> is currently on the allow list.
        /// </summary>
        /// <param name="value">The code point to look up.</param>
        /// <returns>
        /// <see langword="true"/> if <paramref name="value"/> is a BMP code point whose bit is set;
        /// <see langword="false"/> otherwise, including for every non-BMP value.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public readonly bool IsCodePointAllowed(uint value)
        {
            if (UnicodeUtility.IsBmpCodePoint(value))
            {
                _GetIndexAndOffset(value, out nuint dwordIndex, out int bitIndex);
                return (Bitmap[dwordIndex] & (1u << bitIndex)) != 0;
            }

            return false; // we only understand BMP
        }

        /// <summary>
        /// Splits a BMP code point into the index of the bitmap element that holds its bit
        /// and the position of that bit within the element.
        /// </summary>
        /// <param name="value">The code point; passed to <c>UnicodeDebug.AssertIsBmpCodePoint</c> before use.</param>
        /// <param name="index">Receives the element index (<paramref name="value"/> divided by 32).</param>
        /// <param name="offset">Receives the bit position, 0 through 31 (<paramref name="value"/> modulo 32).</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void _GetIndexAndOffset(uint value, out nuint index, out int offset)
        {
            UnicodeDebug.AssertIsBmpCodePoint(value);

            // 32 bits per bitmap element.
            index = value / 32u;
            offset = (int)(value % 32u);
        }
    }
}