// File: src\libraries\System.Private.CoreLib\src\System\Text\EncoderNLS.cs
// Web Access
// Project: src\src\coreclr\System.Private.CoreLib\System.Private.CoreLib.csproj (System.Private.CoreLib)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
 
namespace System.Text
{
    // An Encoder is used to encode a sequence of blocks of characters into
    // a sequence of blocks of bytes. Following instantiation of an encoder,
    // sequential blocks of characters are converted into blocks of bytes through
    // calls to the GetBytes method. The encoder maintains state between the
    // conversions, allowing it to correctly encode character sequences that span
    // adjacent blocks.
    //
    // Instances of specific implementations of the Encoder abstract base
    // class are typically obtained through calls to the GetEncoder method
    // of Encoding objects.
    //
 
    internal class EncoderNLS : Encoder
    {
        // Need a place for the last left over character, most of our encodings use this.
        // A value of 0 means "nothing pending"; Reset() restores that value. The drain helpers
        // in this class treat a non-zero value as a high surrogate left over from a previous call.
        internal char _charLeftOver;
        // The encoding that created this encoder; every conversion call is forwarded to it.
        private readonly Encoding _encoding;
        // The 'flush' argument of the most recent GetByteCount / GetBytes / Convert call.
        // ClearMustFlush() sets it back to false.
        private bool _mustFlush;
        // Set to true by GetByteCount / GetBytes and to false by Convert.
        // NOTE(review): presumably read by the encoding to decide whether running out of
        // destination space throws or merely stops early - confirm against Encoding.
        internal bool _throwOnOverflow;
        // Zeroed by Convert before it calls into the encoding, then read back to produce the
        // 'charsUsed' out value. NOTE(review): presumably updated by the encoding - confirm.
        internal int _charsUsed;
 
        // Binds a new encoder to 'encoding', adopts that encoding's encoder fallback,
        // and puts the instance in its initial (reset) state.
        internal EncoderNLS(Encoding encoding)
        {
            _encoding = encoding;
            _fallback = encoding.EncoderFallback;

            Reset();
        }
 
        // Discards any pending leftover char and, when a fallback buffer has already
        // been created, resets that buffer as well.
        public override void Reset()
        {
            _charLeftOver = '\0';

            if (_fallbackBuffer is not null)
            {
                _fallbackBuffer.Reset();
            }
        }
 
        // Array-based overload: validates the (chars, index, count) triple and then defers
        // to the pointer-based overload.
        //
        // Throws ArgumentNullException when 'chars' is null and ArgumentOutOfRangeException
        // when 'index' or 'count' is negative or the range runs past the end of the array.
        public override unsafe int GetByteCount(char[] chars, int index, int count, bool flush)
        {
            ArgumentNullException.ThrowIfNull(chars);

            ArgumentOutOfRangeException.ThrowIfNegative(index);
            ArgumentOutOfRangeException.ThrowIfNegative(count);

            int available = chars.Length - index;
            if (count > available)
            {
                throw new ArgumentOutOfRangeException(nameof(chars),
                      SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Pin the array and hand the requested slice to the pointer version.
            fixed (char* pinnedChars = &MemoryMarshal.GetArrayDataReference(chars))
            {
                return GetByteCount(pinnedChars + index, count, flush);
            }
        }
 
        // Pointer-based overload: records the call's flush flag, marks the operation as one
        // that throws on overflow, and asks the owning encoding for the byte count, passing
        // this encoder along.
        //
        // Throws ArgumentNullException when 'chars' is null and ArgumentOutOfRangeException
        // when 'count' is negative.
        public override unsafe int GetByteCount(char* chars, int count, bool flush)
        {
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentOutOfRangeException.ThrowIfNegative(count);

            Debug.Assert(_encoding is not null);

            // Per-call state consulted while the encoding does the work.
            _throwOnOverflow = true;
            _mustFlush = flush;

            int byteCount = _encoding.GetByteCount(chars, count, this);
            return byteCount;
        }
 
        // Array-based overload: validates both buffers and then defers to the pointer-based
        // overload. All bytes from 'byteIndex' to the end of 'bytes' are offered as
        // destination space.
        //
        // Throws ArgumentNullException when either array is null and
        // ArgumentOutOfRangeException when an index/count is negative or out of bounds.
        public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                            byte[] bytes, int byteIndex, bool flush)
        {
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentNullException.ThrowIfNull(bytes);

            ArgumentOutOfRangeException.ThrowIfNegative(charIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            int charsAvailable = chars.Length - charIndex;
            if (charCount > charsAvailable)
            {
                throw new ArgumentOutOfRangeException(nameof(chars),
                      SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (byteIndex < 0 || bytes.Length < byteIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex),
                     SR.ArgumentOutOfRange_IndexMustBeLessOrEqual);
            }

            // Everything after byteIndex is usable output space.
            int bytesAvailable = bytes.Length - byteIndex;

            fixed (char* pinnedChars = &MemoryMarshal.GetArrayDataReference(chars))
            fixed (byte* pinnedBytes = &MemoryMarshal.GetArrayDataReference(bytes))
            {
                // charCount is the number of chars to encode, not the size of the array.
                char* source = pinnedChars + charIndex;
                byte* destination = pinnedBytes + byteIndex;
                return GetBytes(source, charCount, destination, bytesAvailable, flush);
            }
        }
 
        // Pointer-based overload: records the call's flush flag, marks the operation as one
        // that throws on overflow, and lets the owning encoding perform the conversion,
        // passing this encoder along. Returns whatever the encoding returns.
        //
        // Throws ArgumentNullException when either pointer is null and
        // ArgumentOutOfRangeException when either count is negative.
        public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush)
        {
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentNullException.ThrowIfNull(bytes);

            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            Debug.Assert(_encoding is not null);

            // Per-call state consulted while the encoding does the work.
            _throwOnOverflow = true;
            _mustFlush = flush;

            int bytesProduced = _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
            return bytesProduced;
        }
 
        // Array-based Convert, for callers whose output buffer might be too small to hold the
        // whole result. Validates both (array, index, count) triples and then forwards to the
        // pointer-based overload, which supplies charsUsed / bytesUsed / completed.
        //
        // Throws ArgumentNullException when either array is null and
        // ArgumentOutOfRangeException when an index/count is negative or out of bounds.
        public override unsafe void Convert(char[] chars, int charIndex, int charCount,
                                            byte[] bytes, int byteIndex, int byteCount, bool flush,
                                            out int charsUsed, out int bytesUsed, out bool completed)
        {
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentNullException.ThrowIfNull(bytes);

            ArgumentOutOfRangeException.ThrowIfNegative(charIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(charCount);

            ArgumentOutOfRangeException.ThrowIfNegative(byteIndex);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);

            int charsAvailable = chars.Length - charIndex;
            if (charCount > charsAvailable)
            {
                throw new ArgumentOutOfRangeException(nameof(chars),
                      SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            int bytesAvailable = bytes.Length - byteIndex;
            if (byteCount > bytesAvailable)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes),
                      SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Pin both arrays and use the pointer version (can't do this for non-msft encoders).
            fixed (char* pinnedChars = &MemoryMarshal.GetArrayDataReference(chars))
            fixed (byte* pinnedBytes = &MemoryMarshal.GetArrayDataReference(bytes))
            {
                char* source = pinnedChars + charIndex;
                byte* destination = pinnedBytes + byteIndex;

                Convert(source, charCount, destination, byteCount, flush,
                    out charsUsed, out bytesUsed, out completed);
            }
        }
 
        // Pointer-based Convert. Sets up the per-call state on this instance, lets the owning
        // encoding do the work, and then reports how far it got.
        //
        // Throws ArgumentNullException when either pointer is null and
        // ArgumentOutOfRangeException when either count is negative.
        public override unsafe void Convert(char* chars, int charCount,
                                            byte* bytes, int byteCount, bool flush,
                                            out int charsUsed, out int bytesUsed, out bool completed)
        {
            ArgumentNullException.ThrowIfNull(chars);
            ArgumentNullException.ThrowIfNull(bytes);

            ArgumentOutOfRangeException.ThrowIfNegative(charCount);
            ArgumentOutOfRangeException.ThrowIfNegative(byteCount);

            Debug.Assert(_encoding is not null);

            // Convert is the non-throwing flavor: overflow is not flagged as an error here.
            _throwOnOverflow = false;
            _mustFlush = flush;
            _charsUsed = 0;

            // Run the conversion, then pick up the consumed-char tally from our field.
            bytesUsed = _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
            charsUsed = _charsUsed;

            // Meaning of 'completed' == false - one of:
            //   a) this call did not consume the entire source buffer; or
            //   b) the entire source buffer was consumed, but data is still pending for the
            //      destination buffer.
            // Either way the caller is expected to slice the input, supply a fresh destination
            // buffer, and call Convert again until 'completed' is true. The final iteration(s)
            // *must* pass flush = true and loop until 'completed' is true, or data may be lost.
            //
            // Precise contract:
            //
            // flush == false:
            //   'completed' MUST be false while source elements remain unconsumed, and MUST be
            //   true once the whole source has been consumed and nothing is pending for the
            //   destination (i.e., a zero-length source with an infinite-length destination
            //   would make no forward progress). The value is undefined when all source data
            //   has been consumed but output is still pending.
            //
            // flush == true:
            //   'completed' is true IF AND ONLY IF
            //   a) every source element has been transcoded into the destination; AND
            //   b) no internal partial-read state remains on this instance; AND
            //   c) no data remains pending for the destination.
            //   Put differently: 'completed' == true means everything has been converted and
            //   this instance is indistinguishable from a freshly-reset one.

            if (charsUsed != charCount)
            {
                // Source not fully consumed.
                completed = false;
            }
            else if (flush && HasState)
            {
                // Asked to flush, but partial-read state is still held.
                completed = false;
            }
            else
            {
                // Done only if the fallback buffer (when one exists) has nothing left to emit.
                completed = _fallbackBuffer is null || _fallbackBuffer.Remaining == 0;
            }
        }
 
        // The encoding this encoder was created for.
        public Encoding Encoding
        {
            get
            {
                Encoding owner = _encoding;
                Debug.Assert(owner is not null);
                return owner;
            }
        }
 
        // The flush flag recorded by the most recent GetByteCount / GetBytes / Convert call
        // (false again after ClearMustFlush).
        public bool MustFlush
        {
            get { return _mustFlush; }
        }
 
        /// <summary>
        /// States whether a call to <see cref="Encoding.GetBytes(char*, int, byte*, int, EncoderNLS)"/> must first drain data on this <see cref="EncoderNLS"/> instance.
        /// That is the case when a leftover char is pending or an existing fallback buffer still has chars remaining.
        /// </summary>
        internal bool HasLeftoverData
        {
            get
            {
                if (_charLeftOver != '\0')
                {
                    return true;
                }

                return _fallbackBuffer is not null && _fallbackBuffer.Remaining > 0;
            }
        }
 
        // Is anything left in our encoder? Here that means a pending leftover char;
        // the property is virtual, so derived encoders may define it differently.
        internal virtual bool HasState
        {
            get { return _charLeftOver != '\0'; }
        }
 
        // Lets the encoding clear our must-flush flag instead of throwing (in ThrowBytesOverflow).
        internal void ClearMustFlush() => _mustFlush = false;
 
        // Computes how many bytes the pending leftover char (if any) would produce when
        // combined with the start of 'chars', without clearing the leftover char.
        //
        // Returns the byte count attributable to the leftover data (0 when there is none, or
        // when 'chars' is empty and no flush was requested). 'charsConsumed' is set to 1 when
        // the first char of 'chars' pairs with the leftover char to form a valid Rune, else 0.
        //
        // Throws ArgumentException when the fallback buffer still holds data from an earlier call.
        internal int DrainLeftoverDataForGetByteCount(ReadOnlySpan<char> chars, out int charsConsumed)
        {
            // Leftover fallback data must not exist at this point: we would consume it here and
            // corrupt whatever Convert call happens to be in progress.
            if (_fallbackBuffer is not null && _fallbackBuffer.Remaining > 0)
            {
                throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, Encoding.EncodingName, _fallbackBuffer.GetType()));
            }

            charsConsumed = 0; // provisional; bumped below if an input char is paired up

            // GetByteCount is supposed to be non-mutating, so only a copy of the leftover high
            // surrogate is used; the field keeps its value for the next call to Convert.
            char pendingHigh = _charLeftOver;
            if (pendingHigh == default)
            {
                return 0; // nothing pending - finished
            }

            char nextChar;
            if (!chars.IsEmpty)
            {
                nextChar = chars[0];
            }
            else if (_mustFlush)
            {
                // Empty input while flushing: the lone high surrogate goes through the
                // fallback mechanism by itself.
                nextChar = default;
            }
            else
            {
                return 0; // empty input and no flush requested: no-op = success
            }

            // A fallback index of -1 conveys "the invalid sequence started one char before the
            // current buffer", which is where the single leftover high surrogate came from.
            if (!Rune.TryCreate(pendingHigh, nextChar, out Rune scalar))
            {
                FallbackBuffer.Fallback(pendingHigh, index: -1);
            }
            else
            {
                charsConsumed = 1; // consumed the leftover high surrogate + the first char in the input buffer

                Debug.Assert(_encoding is not null);
                if (_encoding.TryGetByteCount(scalar, out int byteCount))
                {
                    Debug.Assert(byteCount >= 0, "Encoding shouldn't have returned a negative byte count.");
                    return byteCount;
                }

                // Valid scalar, but the encoding can't represent it: fall back the pair.
                FallbackBuffer.Fallback(pendingHigh, nextChar, index: -1);
            }

            // Tally the bytes the fallback data would have emitted.
            Debug.Assert(_fallbackBuffer is not null);
            return _fallbackBuffer.DrainRemainingDataForGetByteCount();
        }
 
        // Writes to 'bytes' whatever this instance is still holding from a previous call: a
        // leftover high surrogate (paired with the first char of 'chars' when possible), or
        // data remaining in the fallback buffer. We may have one, the other, or neither, but
        // never both on entry.
        //
        // 'charsConsumed' is set to 1 when the first char of 'chars' was paired with the
        // leftover char, else 0. 'bytesWritten' receives the count reported by whichever step
        // last wrote output (0 if none did). Returns true when nothing was pending or the rune
        // was encoded; when fallback data had to be drained, returns the result of that drain.
        internal bool TryDrainLeftoverDataForGetBytes(ReadOnlySpan<char> chars, Span<byte> bytes, out int charsConsumed, out int bytesWritten)
        {
            // Provisional values; updated below as work is done.
            charsConsumed = 0;
            bytesWritten = 0;

            char pendingHigh = _charLeftOver;
            if (pendingHigh != default)
            {
                char nextChar = default;

                if (!chars.IsEmpty)
                {
                    nextChar = chars[0];
                }
                else if (!_mustFlush)
                {
                    // Empty input and no flush requested: leave the leftover char in place.
                    return true; // no-op = success
                }

                // Otherwise (empty input while flushing) the lone high surrogate goes through
                // the fallback mechanism by itself.

                // The leftover char is about to be consumed, so clear the backing field now.
                // Its value is not restored if an exception occurs: exceptional code paths
                // corrupt instance state anyway (e.g., by mutating the fallback buffer contents).
                _charLeftOver = default;

                // Anything that can't be encoded directly is pushed into the fallback buffer
                // and then drained by the shared logic at the end of the method. An index of
                // -1 tells the fallback that the invalid sequence began one char before the
                // current buffer.
                if (!Rune.TryCreate(pendingHigh, nextChar, out Rune scalar))
                {
                    FallbackBuffer.Fallback(pendingHigh, index: -1);
                }
                else
                {
                    charsConsumed = 1; // at the very least, we consumed 1 char from the input
                    Debug.Assert(_encoding is not null);

                    OperationStatus status = _encoding.EncodeRune(scalar, bytes, out bytesWritten);
                    if (status == OperationStatus.Done)
                    {
                        return true; // that's all - we've handled the leftover data
                    }
                    else if (status == OperationStatus.DestinationTooSmall)
                    {
                        _encoding.ThrowBytesOverflow(this, nothingEncoded: true); // will throw
                    }
                    else if (status == OperationStatus.InvalidData)
                    {
                        FallbackBuffer.Fallback(pendingHigh, nextChar, index: -1);
                    }
                    else
                    {
                        Debug.Fail("Unknown return value.");
                    }
                }
            }

            // Drain whatever the fallback buffer is holding, if anything.
            if (_fallbackBuffer is not null && _fallbackBuffer.Remaining > 0)
            {
                return _fallbackBuffer.TryDrainRemainingDataForGetBytes(bytes, out bytesWritten);
            }

            return true; // nothing (more) to do - success
        }
    }
}