// File: src\libraries\System.Private.CoreLib\src\System\Text\DecoderNLS.cs
// Web Access
// Project: src\src\coreclr\System.Private.CoreLib\System.Private.CoreLib.csproj (System.Private.CoreLib)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
 
namespace System.Text
{
    // A Decoder is used to decode a sequence of blocks of bytes into a
    // sequence of blocks of characters. Following instantiation of a decoder,
    // sequential blocks of bytes are converted into blocks of characters through
    // calls to the GetChars method. The decoder maintains state between the
    // conversions, allowing it to correctly decode byte sequences that span
    // adjacent blocks.
    //
    // Instances of specific implementations of the Decoder abstract base
    // class are typically obtained through calls to the GetDecoder method
    // of Encoding objects.
 
    internal class DecoderNLS : Decoder
    {
        // The encoding this decoder was created for. Every public decode operation in this
        // class ends by calling into this encoding, passing the decoder itself along.
        private readonly Encoding _encoding;
        // Flush flag recorded from the most recent GetCharCount / GetChars / Convert call.
        // Exposed through MustFlush and clearable through ClearMustFlush.
        private bool _mustFlush;
        // Set to true by GetCharCount / GetChars and to false by Convert before calling into the encoding.
        internal bool _throwOnOverflow;
        // Reset to 0 by Convert before decoding and read back afterwards to report 'bytesUsed'.
        // NOTE(review): presumably written by the encoding while it decodes; the writer is not in this file.
        internal int _bytesUsed;
        private int _leftoverBytes; // leftover data from a previous invocation of GetChars (up to 4 bytes)
        private int _leftoverByteCount; // number of bytes of actual data in _leftoverBytes
 
        /// <summary>
        /// Creates a decoder bound to <paramref name="encoding"/>, adopts that encoding's
        /// decoder fallback, and starts from a reset state.
        /// </summary>
        /// <param name="encoding">The encoding that performs the actual decoding work.</param>
        internal DecoderNLS(Encoding encoding)
        {
            _encoding = encoding;

            // Same object as the field assigned on the previous line.
            _fallback = encoding.DecoderFallback;

            Reset();
        }
 
        /// <summary>
        /// Returns the decoder to its initial state: any leftover bytes are discarded and,
        /// if a fallback buffer already exists, it is reset too.
        /// </summary>
        public override void Reset()
        {
            ClearLeftoverData();

            if (_fallbackBuffer is { } existingFallbackBuffer)
            {
                existingFallbackBuffer.Reset();
            }
        }
 
        /// <summary>
        /// Non-flushing convenience overload; forwards to the four-argument overload
        /// with <c>flush</c> set to <see langword="false"/>.
        /// </summary>
        public override int GetCharCount(byte[] bytes, int index, int count) =>
            GetCharCount(bytes, index, count, flush: false);
 
        /// <summary>
        /// Validates the array segment described by <paramref name="index"/> and
        /// <paramref name="count"/>, then forwards to the pointer-based overload.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="index"/> or <paramref name="count"/> is negative, or the segment
        /// extends past the end of <paramref name="bytes"/>.
        /// </exception>
        public override unsafe int GetCharCount(byte[] bytes, int index, int count, bool flush)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentOutOfRangeException.ThrowIfNegative(index);
            ArgumentOutOfRangeException.ThrowIfNegative(count);

            // The requested segment must lie entirely inside the array.
            int bytesAvailable = bytes.Length - index;
            if (count > bytesAvailable)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Pin the array through its data reference and hand off to the pointer overload.
            fixed (byte* pinnedBytes = &MemoryMarshal.GetArrayDataReference(bytes))
            {
                return GetCharCount(pinnedBytes + index, count, flush);
            }
        }
 
        /// <summary>
        /// Pointer-based char count. Records the flush mode on this decoder and delegates
        /// the counting to the encoding, passing this decoder along.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
        public override unsafe int GetCharCount(byte* bytes, int count, bool flush)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentOutOfRangeException.ThrowIfNegative(count);

            // Per-call state lives on the decoder, which is handed to the encoding below.
            _throwOnOverflow = true;
            _mustFlush = flush;

            Encoding encoding = _encoding;
            Debug.Assert(encoding is not null);
            return encoding.GetCharCount(bytes, count, this);
        }
 
        /// <summary>
        /// Non-flushing convenience overload; forwards to the six-argument overload
        /// with <c>flush</c> set to <see langword="false"/>.
        /// </summary>
        public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                     char[] chars, int charIndex) =>
            GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: false);
 
        /// <summary>
        /// Validates the source segment and the destination start index, then forwards to the
        /// pointer-based overload. Everything from <paramref name="charIndex"/> to the end of
        /// <paramref name="chars"/> is offered as output space.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="byteIndex"/> or <paramref name="byteCount"/> is negative, the byte segment
        /// extends past the end of <paramref name="bytes"/>, or <paramref name="charIndex"/> is
        /// outside <c>[0, chars.Length]</c>.
        /// </exception>
        public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                            char[] chars, int charIndex, bool flush)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentOutOfRangeException.ThrowIfNegative(byteIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);

            if (byteCount > bytes.Length - byteIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // charIndex may equal chars.Length (no room at all) but may not go beyond it.
            bool charIndexInRange = charIndex >= 0 && charIndex <= chars.Length;
            if (!charIndexInRange)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_IndexMustBeLessOrEqual);
            }

            // Output capacity is the tail of the array, not the array's full length.
            int remainingChars = chars.Length - charIndex;

            fixed (byte* pinnedBytes = &MemoryMarshal.GetArrayDataReference(bytes))
            fixed (char* pinnedChars = &MemoryMarshal.GetArrayDataReference(chars))
            {
                return GetChars(pinnedBytes + byteIndex, byteCount,
                                pinnedChars + charIndex, remainingChars, flush);
            }
        }
 
        /// <summary>
        /// Pointer-based decode. Records the flush mode on this decoder and delegates the
        /// decoding to the encoding, passing this decoder along.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
        /// </exception>
        public override unsafe int GetChars(byte* bytes, int byteCount,
                                            char* chars, int charCount, bool flush)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            // Per-call state lives on the decoder, which is handed to the encoding below.
            _throwOnOverflow = true;
            _mustFlush = flush;

            Encoding encoding = _encoding;
            Debug.Assert(encoding is not null);
            return encoding.GetChars(bytes, byteCount, chars, charCount, this);
        }
 
        /// <summary>
        /// Array-based conversion for callers whose output buffer might be too small.
        /// Validates both array segments and forwards to the pointer-based overload,
        /// which produces the three <see langword="out"/> results.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Any index or count is negative, or either segment extends past the end of its array.
        /// </exception>
        public override unsafe void Convert(byte[] bytes, int byteIndex, int byteCount,
                                            char[] chars, int charIndex, int charCount, bool flush,
                                            out int bytesUsed, out int charsUsed, out bool completed)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentOutOfRangeException.ThrowIfNegative(byteIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);
            ArgumentOutOfRangeException.ThrowIfNegative(charIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            // Source segment must fit inside 'bytes'.
            if (byteCount > bytes.Length - byteIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Destination segment must fit inside 'chars'.
            if (charCount > chars.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            fixed (byte* pinnedBytes = &MemoryMarshal.GetArrayDataReference(bytes))
            fixed (char* pinnedChars = &MemoryMarshal.GetArrayDataReference(chars))
            {
                Convert(pinnedBytes + byteIndex, byteCount,
                        pinnedChars + charIndex, charCount,
                        flush,
                        out bytesUsed, out charsUsed, out completed);
            }
        }
 
        /// <summary>
        /// Pointer-based conversion. Puts the decoder into non-throwing overflow mode, lets the
        /// encoding decode as much as it can, and reports how far it got.
        /// </summary>
        /// <param name="bytesUsed">Receives the value of <c>_bytesUsed</c> after the encoding has run.</param>
        /// <param name="charsUsed">Receives the value returned by the encoding's <c>GetChars</c>.</param>
        /// <param name="completed">
        /// <see langword="true"/> only when every input byte was used, no decoder state remains if a
        /// flush was requested, and the fallback buffer (if any) has nothing remaining.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
        /// </exception>
        public override unsafe void Convert(byte* bytes, int byteCount,
                                              char* chars, int charCount, bool flush,
                                              out int bytesUsed, out int charsUsed, out bool completed)
        {
            ArgumentNullException.ThrowIfNull(bytes);
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            // Convert must not throw on a too-small destination, so overflow throwing is switched off.
            _bytesUsed = 0;
            _throwOnOverflow = false;
            _mustFlush = flush;

            Encoding encoding = _encoding;
            Debug.Assert(encoding is not null);
            charsUsed = encoding.GetChars(bytes, byteCount, chars, charCount, this);
            bytesUsed = _bytesUsed;

            // See comment in EncoderNLS.Convert for the details of the logic below.
            // Each later condition is evaluated only when the earlier ones hold.
            bool consumedAllInput = bytesUsed == byteCount;
            bool nothingLeftToFlush = consumedAllInput && !(flush && HasState);
            completed = nothingLeftToFlush
                && (_fallbackBuffer is null || _fallbackBuffer.Remaining == 0);
        }
 
        /// <summary>
        /// Gets the flush flag recorded by the most recent GetCharCount / GetChars / Convert call
        /// (or <see langword="false"/> if it has since been cleared via ClearMustFlush).
        /// </summary>
        public bool MustFlush
        {
            get { return _mustFlush; }
        }
 
        /// <summary>
        /// Gets whether the decoder is holding undecoded data. This base implementation reports
        /// whether any leftover bytes are stored; it is virtual so derived decoders can override it.
        /// </summary>
        internal virtual bool HasState
        {
            get { return _leftoverByteCount != 0; }
        }
 
        /// <summary>
        /// Turns the recorded flush flag off. Lets the encoding clear the flag instead of
        /// throwing (in ThrowCharsOverflow).
        /// </summary>
        internal void ClearMustFlush() => _mustFlush = false;
 
        /// <summary>
        /// Returns a read-only byte view over the leftover storage field, trimmed to the number
        /// of bytes currently recorded as leftover.
        /// </summary>
        internal ReadOnlySpan<byte> GetLeftoverData()
        {
            // View the single int field as raw bytes, then expose only the valid prefix.
            ReadOnlySpan<int> storage = new ReadOnlySpan<int>(in _leftoverBytes);
            ReadOnlySpan<byte> storageBytes = MemoryMarshal.AsBytes(storage);
            return storageBytes.Slice(0, _leftoverByteCount);
        }
 
        /// <summary>
        /// Stores <paramref name="bytes"/> as the decoder's leftover data and records its length.
        /// The copy targets the byte view of a single int field, so the input must fit in that field.
        /// </summary>
        internal void SetLeftoverData(ReadOnlySpan<byte> bytes)
        {
            Span<byte> storageBytes = MemoryMarshal.AsBytes(new Span<int>(ref _leftoverBytes));

            // Copy first; the count is only updated once the copy has succeeded.
            bytes.CopyTo(storageBytes);
            _leftoverByteCount = bytes.Length;
        }
 
        /// <summary>
        /// Gets whether any leftover bytes are currently stored in the decoder.
        /// </summary>
        internal bool HasLeftoverData
        {
            get { return _leftoverByteCount != 0; }
        }
 
        /// <summary>
        /// Discards the leftover data by zeroing its recorded length. The storage field itself is
        /// not wiped; GetLeftoverData slices by the length, so the stale bytes are never exposed.
        /// </summary>
        internal void ClearLeftoverData() => _leftoverByteCount = 0;
 
        /// <summary>
        /// Computes how many chars result from decoding the first scalar value formed by the
        /// decoder's leftover bytes followed by the start of <paramref name="bytes"/>. This routine
        /// only counts: it does not call SetLeftoverData or ClearLeftoverData.
        /// </summary>
        /// <param name="bytes">The new incoming data that follows the stored leftover bytes.</param>
        /// <param name="bytesConsumed">
        /// Receives the number of bytes consumed from the combined buffer minus the stored
        /// leftover byte count, i.e. how much of <paramref name="bytes"/> was consumed.
        /// </param>
        /// <returns>
        /// The UTF-16 length of the decoded scalar, the char count drained from the fallback buffer
        /// when the data could not be decoded, or 0 when more data is needed and no flush is pending.
        /// </returns>
        internal int DrainLeftoverDataForGetCharCount(ReadOnlySpan<byte> bytes, out int bytesConsumed)
        {
            // Quick check: we _should not_ have leftover fallback data from a previous invocation,
            // as we'd end up consuming any such data and would corrupt whatever Convert call happens
            // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.

            Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
            Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

            // Copy the existing leftover data plus as many bytes as possible of the new incoming data
            // into a temporary concatenated buffer (at most 4 bytes), then get its char count by
            // decoding it. The span is re-sliced to the number of bytes actually copied.

            Span<byte> combinedBuffer = stackalloc byte[4];
            combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
            int charCount = 0;

            Debug.Assert(_encoding is not null);
            switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
            {
                case OperationStatus.Done:
                    // A full scalar was decoded; it contributes its UTF-16 length.
                    charCount = value.Utf16SequenceLength;
                    goto Finish; // successfully transcoded bytes -> chars

                case OperationStatus.NeedMoreData:
                    // An incomplete sequence is only acceptable when the caller is not flushing.
                    if (MustFlush)
                    {
                        goto case OperationStatus.InvalidData; // treat as equivalent to bad data
                    }
                    else
                    {
                        goto Finish; // consumed some bytes, output 0 chars
                    }

                case OperationStatus.InvalidData:
                    break; // leave the switch and run the fallback path below

                default:
                    Debug.Fail("Unexpected OperationStatus return value.");
                    break;
            }

            // Couldn't decode the buffer. Fallback the buffer instead. See comment in DrainLeftoverDataForGetChars
            // for more information on why a negative index is provided.
            // If Fallback returns false, charCount stays 0.
            // NOTE(review): the null-forgiving '!' below presumably relies on the FallbackBuffer getter
            // (defined on the base class, not in this file) having populated _fallbackBuffer - confirm.

            if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount))
            {
                charCount = _fallbackBuffer!.DrainRemainingDataForGetCharCount();
                Debug.Assert(charCount >= 0, "Fallback buffer shouldn't have returned a negative char count.");
            }

        Finish:

            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
            return charCount;
        }
 
        /// <summary>
        /// Decodes the first scalar value formed by the decoder's leftover bytes followed by the
        /// start of <paramref name="bytes"/>, writes the result to <paramref name="chars"/>, and
        /// updates the stored leftover data.
        /// </summary>
        /// <param name="bytes">The new incoming data that follows the stored leftover bytes.</param>
        /// <param name="chars">The destination for the decoded (or fallback) chars.</param>
        /// <param name="bytesConsumed">
        /// Receives the number of bytes consumed from the combined buffer minus the leftover byte
        /// count as it was on entry, i.e. how much of <paramref name="bytes"/> was consumed.
        /// </param>
        /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
        /// <remarks>
        /// On normal completion the leftover data is either replaced by the whole combined buffer
        /// (more data needed, not flushing) or cleared (all other cases). If the destination is too
        /// small, control goes to the encoding's ThrowCharsOverflow and this method does not itself
        /// modify the leftover data on that path.
        /// </remarks>
        internal int DrainLeftoverDataForGetChars(ReadOnlySpan<byte> bytes, Span<char> chars, out int bytesConsumed)
        {
            // Quick check: we _should not_ have leftover fallback data from a previous invocation,
            // as we'd end up consuming any such data and would corrupt whatever Convert call happens
            // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.

            Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
            Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

            // Copy the existing leftover data plus as many bytes as possible of the new incoming data
            // into a temporary concatenated buffer (at most 4 bytes), then transcode it from bytes to
            // chars. The span is re-sliced to the number of bytes actually copied.

            Span<byte> combinedBuffer = stackalloc byte[4];
            combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
            int charsWritten = 0;

            // Set only on the "need more data, not flushing" path; decides at Finish whether the
            // combined buffer becomes the new leftover data or the leftover data is cleared.
            bool persistNewCombinedBuffer = false;

            Debug.Assert(_encoding is not null);
            switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
            {
                case OperationStatus.Done:
                    // A full scalar was decoded; it must fit in the destination or we fail outright.
                    if (value.TryEncodeToUtf16(chars, out charsWritten))
                    {
                        goto Finish; // successfully transcoded bytes -> chars
                    }
                    else
                    {
                        goto DestinationTooSmall;
                    }

                case OperationStatus.NeedMoreData:
                    // An incomplete sequence is only acceptable when the caller is not flushing.
                    if (MustFlush)
                    {
                        goto case OperationStatus.InvalidData; // treat as equivalent to bad data
                    }
                    else
                    {
                        persistNewCombinedBuffer = true;
                        goto Finish; // successfully consumed some bytes, output no chars
                    }

                case OperationStatus.InvalidData:
                    break; // leave the switch and run the fallback path below

                default:
                    Debug.Fail("Unexpected OperationStatus return value.");
                    break;
            }

            // Couldn't decode the buffer. Fallback the buffer instead. The fallback mechanism relies
            // on a negative index to convey "the start of the invalid sequence was some number of
            // bytes back before the current buffer." Since we know the invalid sequence must have
            // started at the beginning of our leftover byte buffer, we can signal to our caller that
            // they must backtrack that many bytes to find the real start of the invalid sequence.
            //
            // If Fallback returns false, or the fallback chars were drained successfully, execution
            // falls through to Finish with persistNewCombinedBuffer still false.
            // NOTE(review): the null-forgiving '!' below presumably relies on the FallbackBuffer getter
            // (defined on the base class, not in this file) having populated _fallbackBuffer - confirm.

            if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount)
                && !_fallbackBuffer!.TryDrainRemainingDataForGetChars(chars, out charsWritten))
            {
                goto DestinationTooSmall;
            }

        Finish:

            // Report back the number of bytes (from the new incoming span) we consumed just now.
            // This calculation is simple: it's the difference between the original leftover byte
            // count and the number of bytes from the combined buffer we needed to decode the first
            // scalar value. We need to report this before the call to SetLeftoverData /
            // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field.

            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;

            if (persistNewCombinedBuffer)
            {
                Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
                SetLeftoverData(combinedBuffer); // the buffer still only contains partial data; a future call to Convert will need it
            }
            else
            {
                ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
            }

            return charsWritten;

        DestinationTooSmall:

            // If we got to this point, we're trying to write chars to the output buffer, but we're unable to do
            // so. Unlike EncoderNLS, this type does not allow partial writes to the output buffer. Since we know
            // draining leftover data is the first operation performed by any DecoderNLS API, there was no
            // opportunity for any code before us to make forward progress, so we must fail immediately.

            _encoding.ThrowCharsOverflow(this, nothingDecoded: true);
            throw null!; // will never reach this point
        }
 
        /// <summary>
        /// Fills <paramref name="dest"/> from its start with the contents of <paramref name="srcLeft"/>
        /// and then <paramref name="srcRight"/>, stopping as soon as <paramref name="dest"/> is full.
        /// </summary>
        /// <returns>The total number of bytes written to <paramref name="dest"/>.</returns>
        private static int ConcatInto(ReadOnlySpan<byte> srcLeft, ReadOnlySpan<byte> srcRight, Span<byte> dest)
        {
            // The left source gets first claim on the destination.
            int takenFromLeft = Math.Min(srcLeft.Length, dest.Length);
            for (int i = 0; i < takenFromLeft; i++)
            {
                dest[i] = srcLeft[i];
            }

            // The right source gets whatever room remains after the left part.
            int takenFromRight = Math.Min(srcRight.Length, dest.Length - takenFromLeft);
            for (int i = 0; i < takenFromRight; i++)
            {
                dest[takenFromLeft + i] = srcRight[i];
            }

            return takenFromLeft + takenFromRight;
        }
    }
}