File: System\PercentEncodingHelper.cs
Web Access
Project: src\src\libraries\System.Private.Uri\src\System.Private.Uri.csproj (System.Private.Uri)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Diagnostics;
using System.Text;
 
namespace System
{
    /// <summary>
    /// Helper for unescaping percent-encoded ("%XX") UTF-8 byte sequences.
    /// </summary>
    internal static class PercentEncodingHelper
    {
        /// <summary>
        /// Unescapes a run of consecutive "%XX" escapes, each encoding a byte that is 0x80 or greater, starting at <paramref name="input"/>.
        /// Byte sequences that decode to a valid UTF-8 rune are appended to <paramref name="dest"/> as that rune.
        /// Bytes that are not valid UTF-8, and (when <paramref name="iriParsing"/> is true) runes rejected by
        /// <c>IriHelper.CheckIriUnicodeRange</c>, are appended to <paramref name="dest"/> in their original escaped form.
        /// The run ends at the first position that is not '%' followed by a hex digit in the range 8-F and a second hex digit,
        /// or when fewer than three characters remain.
        /// </summary>
        /// <param name="input">Pointer to the first character to process; it must be a '%' that starts an escape of a non-ASCII byte (see the assertions below).</param>
        /// <param name="length">Number of characters available starting at <paramref name="input"/>; must be at least 3.</param>
        /// <param name="dest">Builder that receives the decoded runes and any escapes that are copied through unchanged.</param>
        /// <param name="isQuery">Forwarded to <c>IriHelper.CheckIriUnicodeRange</c> when <paramref name="iriParsing"/> is true.</param>
        /// <param name="iriParsing">When true, a decoded rune is only emitted unescaped if <c>IriHelper.CheckIriUnicodeRange</c> accepts it.</param>
        /// <returns>The number of characters of <paramref name="input"/> that were processed; always a multiple of 3.</returns>
        public static unsafe int UnescapePercentEncodedUTF8Sequence(char* input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing)
        {
            // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings
            // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur

            // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence
            Debug.Assert(length >= 3);
            Debug.Assert(input[0] == '%');
            Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar);
            Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128);

            // Holds up to four decoded bytes, laid out so that the oldest unconsumed byte sits at the lowest memory address
            // once the buffer is full (or has been aligned in NoMoreOrInvalidInput)
            uint fourByteBuffer = 0;
            // Number of decoded-but-not-yet-consumed bytes currently tracked in fourByteBuffer
            int bytesLeftInBuffer = 0;

            // Number of input chars fully handled so far (3 chars per consumed byte)
            int totalCharsConsumed = 0;
            // Number of input chars immediately preceding totalCharsConsumed that must be copied to dest verbatim (still escaped)
            // The copy is deferred so that adjacent pass-through escapes are appended in a single call
            int charsToCopy = 0;
            // Number of bytes reported as consumed by the most recent Rune.DecodeFromUtf8 call (0 before the first call)
            int bytesConsumed = 0;

        RefillBuffer:
            // Every byte still in the buffer corresponds to 3 input chars that were already read, so skip past them
            int i = totalCharsConsumed + (bytesLeftInBuffer * 3);

        ReadByteFromInput:
            // Stop if fewer than 3 chars remain or the next char does not start an escape
            if ((uint)(length - i) <= 2 || input[i] != '%')
                goto NoMoreOrInvalidInput;

            // Parse the high nibble; only 8-9 and A-F (either case) are accepted so that the resulting byte is >= 0x80
            uint value = input[i + 1];
            // Clearing the 0x20 bit of (value - 'A') folds 'a'-'f' onto 'A'-'F', giving a case-insensitive range check
            if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A'))
            {
                value = (value | 0x20) - 'a' + 10;
            }
            // Unsigned subtraction wraps for chars below '8', so this single comparison accepts only '8' and '9'
            else if ((value - '8') <= ('9' - '8'))
            {
                value -= '0';
            }
            else goto NoMoreOrInvalidInput; // First character wasn't hex or was <= 7F (Ascii)

            // Parse the low nibble; any hex digit is accepted
            uint second = (uint)input[i + 2] - '0';
            if (second <= 9)
            {
                // second is already [0, 9]
            }
            // second was offset by '0' above, so subtract ('A' - '0') to get the distance from 'A' before the case-folding check
            else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A'))
            {
                second = ((second + '0') | 0x20) - 'a' + 10;
            }
            else goto NoMoreOrInvalidInput; // Second character wasn't Hex

            value = (value << 4) | second;

            Debug.Assert(value >= 128);

            // Rotate the buffer and overwrite the last byte
            // On little-endian the new byte goes into the most significant byte (highest address) and older bytes move down;
            // on big-endian the new byte goes into the least significant byte (also the highest address) and older bytes move up
            if (BitConverter.IsLittleEndian)
            {
                fourByteBuffer = (fourByteBuffer >> 8) | (value << 24);
            }
            else
            {
                fourByteBuffer = (fourByteBuffer << 8) | value;
            }

            // Keep reading escapes until the buffer holds 4 bytes, then fall through to decoding
            if (++bytesLeftInBuffer != 4)
            {
                i += 3;
                goto ReadByteFromInput;
            }

        DecodeRune:
            Debug.Assert(totalCharsConsumed % 3 == 0);
            Debug.Assert(bytesLeftInBuffer == 2 || bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4);
            // Every byte in the buffer was read as >= 0x80, so the high bit of each of the first bytesLeftInBuffer bytes (in memory order) must be set
            Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0);
            Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0);
            Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0);
            Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0);

            uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken
            // Decode a single rune from the first bytesLeftInBuffer bytes of the copy
            if (Rune.DecodeFromUtf8(new ReadOnlySpan<byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done)
            {
                Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution");

                // When IRI parsing is enabled, only runes accepted by CheckIriUnicodeRange are emitted unescaped
                if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery))
                {
                    // Flush any escapes that were deferred for verbatim copy before appending the decoded rune
                    if (charsToCopy != 0)
                    {
                        dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy);
                        charsToCopy = 0;
                    }

                    dest.Append(rune);
                    goto AfterDecodeRune;
                }
            }
            else
            {
                Debug.Assert(bytesConsumed > 0, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes when decoding {bytesLeftInBuffer} bytes");
            }
            // Either decoding failed or the rune was rejected by the IRI range check: keep these escapes in their original form
            charsToCopy += bytesConsumed * 3;

        AfterDecodeRune:
            // The consumed bytes are not shifted out here; they are rotated out as new bytes are read,
            // or aligned away in NoMoreOrInvalidInput if no more input can be read
            bytesLeftInBuffer -= bytesConsumed;
            totalCharsConsumed += bytesConsumed * 3;
            goto RefillBuffer;

        NoMoreOrInvalidInput:
            Debug.Assert(bytesLeftInBuffer < 4);

            // If we have more than 1 byte left, we try to decode it
            if (bytesLeftInBuffer > 1)
            {
                Debug.Assert(bytesLeftInBuffer == 2 || bytesLeftInBuffer == 3);

                // We reach this branch if we don't have 4 valid bytes to consume
                // We have to align the read bytes to the start of fourByteBuffer memory
                // We do this by shifting the fourByteBuffer, the shift direction is determined by system endianness

                // If we read 3 bytes, we shift by 1; if we read 2, we shift by 2
                // (32 - (bytesLeftInBuffer << 3)) calculates this offset:
                // bytesLeftInBuffer == 3 => (32 - (3 << 3)) => 32 - 24 => 8 bits
                // bytesLeftInBuffer == 2 => (32 - (2 << 3)) => 32 - 16 => 16 bits

                // For invalid input we tried to decode in DecodeRune, we may return here if we have more than 1 byte left
                // If bytesConsumed is 1, shift by 1 byte
                // If bytesConsumed is 2:
                // a) We had 4 bytes in the buffer and now only have 2 => Shift by 2 bytes
                // b) We read 1 more byte, leaving us with 3 bytes in the buffer => Shift by 1 byte
                // The case for bytesConsumed == 2 is handled by the else block as the offsets are the same as for valid input described above

                if (bytesConsumed == 1)
                {
                    if (BitConverter.IsLittleEndian)
                    {
                        fourByteBuffer >>= 8;
                    }
                    else
                    {
                        fourByteBuffer <<= 8;
                    }
                }
                else
                {
                    if (BitConverter.IsLittleEndian)
                    {
                        fourByteBuffer >>= (32 - (bytesLeftInBuffer << 3));
                    }
                    else
                    {
                        fourByteBuffer <<= (32 - (bytesLeftInBuffer << 3));
                    }
                }
                goto DecodeRune;
            }

            Debug.Assert(bytesLeftInBuffer == 0 || bytesLeftInBuffer == 1);

            // Nothing left in the buffer and nothing pending to copy: everything consumed so far has already been written to dest
            if ((bytesLeftInBuffer | charsToCopy) == 0)
                return totalCharsConsumed;

            // A single leftover byte cannot be decoded (every byte read here is >= 0x80), so its 3-char escape is copied verbatim
            // together with any pending pass-through escapes; from here on bytesLeftInBuffer holds a char count (0 or 3)
            bytesLeftInBuffer *= 3;
            dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy + bytesLeftInBuffer);
            return totalCharsConsumed + bytesLeftInBuffer;
        }
    }
}