File: System\Formats\Tar\GnuSparseStream.cs
Web Access
Project: src\src\libraries\System.Formats.Tar\src\System.Formats.Tar.csproj (System.Formats.Tar)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Buffers;
using System.Buffers.Text;
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
 
namespace System.Formats.Tar
{
    // Stream that wraps the raw data section of a GNU sparse format 1.0 PAX entry and
    // expands it to the virtual file size by inserting zeros for sparse holes.
    //
    // The raw data section layout:
    //   [sparse map text: numSegs\n, then pairs of offset\n numbytes\n]
    //   [zero padding to the next 512-byte block boundary]
    //   [packed non-zero data segments, stored sequentially]
    //
    // This stream presents a virtual file of size 'realSize' where:
    //   - regions covered by sparse map segments contain the packed data
    //   - all other regions (holes) read as zero bytes
    internal sealed class GnuSparseStream : Stream
    {
        // Caps the segment count to prevent excessive memory allocation from malformed archives.
        // Each segment entry in the array occupies 16 bytes, so 1M segments = 16 MB.
        private const int MaxSparseSegments = 1_000_000;
 
        private readonly Stream _rawStream;
        private bool _isDisposed;
        private readonly long _realSize;
 
        // Sparse map state — initialized lazily on first Read to avoid consuming the raw
        // stream before TarWriter has a chance to copy the condensed data.
        private (long Offset, long Length)[]? _segments;
        private long[]? _packedStartOffsets;
 
        private long _virtualPosition; // current position in the virtual (expanded) file
 
        // For non-seekable streams: tracks how many bytes of packed data have been consumed
        // so we can skip forward when there are holes between segments.
        private long _nextPackedOffset;
 
        // Cached segment index for sequential read optimization.
        // For typical forward sequential reads, this avoids repeated binary searches.
        private int _currentSegmentIndex;
 
        internal GnuSparseStream(Stream rawStream, long realSize)
        {
            _rawStream = rawStream;
            _realSize = realSize;
        }
 
        // Parses the sparse map on first read. Populates _segments, _packedStartOffsets,
        // and _dataStart. Throws InvalidDataException if the sparse map is malformed.
        private void EnsureInitialized()
        {
            if (_segments is not null)
            {
                return;
            }
 
            var segments = ParseSparseMap(isAsync: false, _rawStream, CancellationToken.None).GetAwaiter().GetResult();
            InitializeFromParsedMap(segments);
        }
 
        private async ValueTask EnsureInitializedAsync(CancellationToken cancellationToken)
        {
            if (_segments is not null)
            {
                return;
            }
 
            var segments = await ParseSparseMap(isAsync: true, _rawStream, cancellationToken).ConfigureAwait(false);
            InitializeFromParsedMap(segments);
        }
 
        private void InitializeFromParsedMap((long Offset, long Length)[] segments)
        {
            _packedStartOffsets = new long[segments.Length];
            long sum = 0;
            long previousEnd = 0;
            for (int i = 0; i < segments.Length; i++)
            {
                var (offset, length) = segments[i];
 
                // Validate segment ordering and bounds. Avoid overflow by checking length separately.
                if (offset < previousEnd || offset > _realSize || length > _realSize - offset)
                {
                    throw new InvalidDataException(SR.TarInvalidNumber);
                }
                previousEnd = offset + length;
 
                _packedStartOffsets[i] = sum;
                try
                {
                    sum = checked(sum + length);
                }
                catch (OverflowException ex)
                {
                    throw new InvalidDataException(SR.TarInvalidNumber, ex);
                }
            }
            // Assign _segments last — it serves as the initialization flag.
            _segments = segments;
        }
 
        public override bool CanRead => !_isDisposed;
        public override bool CanSeek => !_isDisposed && _rawStream.CanSeek;
        public override bool CanWrite => false;
 
        public override long Length
        {
            get
            {
                ThrowIfDisposed();
                return _realSize;
            }
        }
 
        public override long Position
        {
            get
            {
                ThrowIfDisposed();
                return _virtualPosition;
            }
            set
            {
                ThrowIfDisposed();
                if (!_rawStream.CanSeek)
                {
                    throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
                }
                ArgumentOutOfRangeException.ThrowIfNegative(value);
                _virtualPosition = value;
                // _currentSegmentIndex is not reset here; FindSegmentFromCurrent handles
                // backward seeks using binary search.
            }
        }
 
        public override long Seek(long offset, SeekOrigin origin)
        {
            ThrowIfDisposed();
            if (!_rawStream.CanSeek)
            {
                throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
            }
 
            long newPosition = origin switch
            {
                SeekOrigin.Begin => offset,
                SeekOrigin.Current => _virtualPosition + offset,
                SeekOrigin.End => _realSize + offset,
                _ => throw new ArgumentOutOfRangeException(nameof(origin)),
            };
 
            if (newPosition < 0)
            {
                throw new IOException(SR.IO_SeekBeforeBegin);
            }
 
            _virtualPosition = newPosition;
            // _currentSegmentIndex is not reset here; FindSegmentFromCurrent handles
            // backward seeks using binary search.
            return _virtualPosition;
        }
 
        public override int Read(byte[] buffer, int offset, int count)
        {
            ValidateBufferArguments(buffer, offset, count);
            return Read(buffer.AsSpan(offset, count));
        }
 
        public override int Read(Span<byte> destination)
        {
            ThrowIfDisposed();
            EnsureInitialized();
            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
 
            if (destination.IsEmpty || _virtualPosition >= _realSize)
            {
                return 0;
            }
 
            int toRead = (int)Math.Min(destination.Length, _realSize - _virtualPosition);
            destination = destination.Slice(0, toRead);
 
            int totalFilled = 0;
            while (totalFilled < toRead)
            {
                long vPos = _virtualPosition + totalFilled;
                int segIdx = FindSegmentFromCurrent(vPos);
 
                if (segIdx < 0)
                {
                    // vPos is in a sparse hole — fill with zeros until the next segment or end of file.
                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
                    destination.Slice(totalFilled, zeroCount).Clear();
                    totalFilled += zeroCount;
                }
                else
                {
                    // vPos is within segment segIdx — read from packed data.
                    var (segOffset, segLength) = _segments[segIdx];
                    long offsetInSeg = vPos - segOffset;
                    long remainingInSeg = segLength - offsetInSeg;
                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
 
                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
                    int bytesRead = ReadFromPackedData(destination.Slice(totalFilled, countToRead), packedOffset);
                    totalFilled += bytesRead;
                    break; // Return after an underlying read; caller can call Read again for more.
                }
            }
 
            _virtualPosition += totalFilled;
            return totalFilled;
        }
 
        public override Task<int> ReadAsync(byte[] buffer, int offset, int count, CancellationToken cancellationToken)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return Task.FromCanceled<int>(cancellationToken);
            }
            ValidateBufferArguments(buffer, offset, count);
            return ReadAsync(new Memory<byte>(buffer, offset, count), cancellationToken).AsTask();
        }
 
        public override ValueTask<int> ReadAsync(Memory<byte> buffer, CancellationToken cancellationToken = default)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return ValueTask.FromCanceled<int>(cancellationToken);
            }
            ThrowIfDisposed();
            if (buffer.IsEmpty || _virtualPosition >= _realSize)
            {
                return ValueTask.FromResult(0);
            }
            return ReadAsyncCore(buffer, cancellationToken);
        }
 
        private async ValueTask<int> ReadAsyncCore(Memory<byte> buffer, CancellationToken cancellationToken)
        {
            await EnsureInitializedAsync(cancellationToken).ConfigureAwait(false);
            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
 
            int toRead = (int)Math.Min(buffer.Length, _realSize - _virtualPosition);
            buffer = buffer.Slice(0, toRead);
 
            int totalFilled = 0;
            while (totalFilled < toRead)
            {
                long vPos = _virtualPosition + totalFilled;
                int segIdx = FindSegmentFromCurrent(vPos);
 
                if (segIdx < 0)
                {
                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
                    buffer.Slice(totalFilled, zeroCount).Span.Clear();
                    totalFilled += zeroCount;
                }
                else
                {
                    var (segOffset, segLength) = _segments[segIdx];
                    long offsetInSeg = vPos - segOffset;
                    long remainingInSeg = segLength - offsetInSeg;
                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
 
                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
                    int bytesRead = await ReadFromPackedDataAsync(buffer.Slice(totalFilled, countToRead), packedOffset, cancellationToken).ConfigureAwait(false);
                    totalFilled += bytesRead;
                    break;
                }
            }
 
            _virtualPosition += totalFilled;
            return totalFilled;
        }
 
        // Exposes the underlying raw stream for callers that need to access the condensed data.
        internal Stream BaseStream => _rawStream;
 
        // Reads from the packed data at the given packedOffset.
        // After EnsureInitialized, the raw stream is positioned at _dataStart and
        // _nextPackedOffset tracks how far into the packed data we've read.
        // Returns the number of bytes actually read (may be less than destination.Length).
        private int ReadFromPackedData(Span<byte> destination, long packedOffset)
        {
            long skipBytes = packedOffset - _nextPackedOffset;
            if (skipBytes < 0 && !_rawStream.CanSeek)
            {
                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
            }
            if (skipBytes != 0)
            {
                TarHelpers.AdvanceStream(_rawStream, skipBytes);
            }
            int bytesRead = _rawStream.Read(destination);
            _nextPackedOffset = packedOffset + bytesRead;
            return bytesRead;
        }
 
        private async ValueTask<int> ReadFromPackedDataAsync(Memory<byte> destination, long packedOffset, CancellationToken cancellationToken)
        {
            long skipBytes = packedOffset - _nextPackedOffset;
            if (skipBytes < 0 && !_rawStream.CanSeek)
            {
                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
            }
            if (skipBytes != 0)
            {
                await TarHelpers.AdvanceStreamAsync(_rawStream, skipBytes, cancellationToken).ConfigureAwait(false);
            }
            int bytesRead = await _rawStream.ReadAsync(destination, cancellationToken).ConfigureAwait(false);
            _nextPackedOffset = packedOffset + bytesRead;
            return bytesRead;
        }
 
        // Finds the segment containing virtualPosition.
        // Uses a sequential hint (_currentSegmentIndex) for O(1) amortized forward reads,
        // and falls back to binary search when seeking backward or jumping into already-passed regions.
        // Returns the segment index if found, or the bitwise complement of the
        // insertion point (a negative number) if virtualPosition is in a hole.
        private int FindSegmentFromCurrent(long virtualPosition)
        {
            Debug.Assert(_segments is not null);
 
            if (_segments.Length == 0)
            {
                return ~0;
            }
 
            // If the hint is past all segments, check for backward seek.
            if (_currentSegmentIndex >= _segments.Length)
            {
                long lastEnd = _segments[_segments.Length - 1].Offset + _segments[_segments.Length - 1].Length;
                if (virtualPosition >= lastEnd)
                {
                    // Still in the trailing hole — no search needed.
                    return ~_segments.Length;
                }
                // Seeked back into segment range; use binary search over the full array.
                int result = BinarySearchSegment(virtualPosition, 0, _segments.Length - 1);
                _currentSegmentIndex = result >= 0 ? result : ~result;
                return result;
            }
 
            // If position is before the current hint, use binary search (handles backward seeks).
            if (virtualPosition < _segments[_currentSegmentIndex].Offset)
            {
                int result = BinarySearchSegment(virtualPosition, 0, _currentSegmentIndex - 1);
                _currentSegmentIndex = result >= 0 ? result : ~result;
                return result;
            }
 
            // Scan forward from the current hint (O(1) amortized for sequential reads).
            while (_currentSegmentIndex < _segments.Length)
            {
                var (offset, length) = _segments[_currentSegmentIndex];
                if (virtualPosition < offset)
                {
                    // Position is in a hole before the current segment.
                    return ~_currentSegmentIndex;
                }
                if (virtualPosition < offset + length)
                {
                    // Position is within the current segment.
                    return _currentSegmentIndex;
                }
                // Position is past this segment; advance to the next.
                _currentSegmentIndex++;
            }
            return ~_segments.Length; // Past all segments.
        }
 
        // Binary search over sorted segments in the range [lo, hi].
        // Returns the segment index if found, or ~insertionPoint if virtualPosition is in a hole.
        private int BinarySearchSegment(long virtualPosition, int lo, int hi)
        {
            Debug.Assert(_segments is not null);
            while (lo <= hi)
            {
                int mid = lo + (hi - lo) / 2;
                var (offset, length) = _segments[mid];
                if (virtualPosition < offset)
                {
                    hi = mid - 1;
                }
                else if (virtualPosition >= offset + length)
                {
                    lo = mid + 1;
                }
                else
                {
                    return mid;
                }
            }
            return ~lo;
        }
 
        // Parses the sparse map from rawStream (positioned at the start of the data section).
        // The map format is: numSegments\n, then pairs of offset\n numbytes\n.
        // After the map text, there is zero-padding to the next 512-byte block boundary,
        // and then the packed data begins.
        //
        // Returns the parsed segments.
        private static async Task<(long Offset, long Length)[]> ParseSparseMap(
            bool isAsync, Stream rawStream, CancellationToken cancellationToken)
        {
            // The buffer is 2 * RecordSize (1024 bytes) and each fill reads exactly RecordSize (512)
            // bytes. This guarantees that the total bytes read is always a multiple of RecordSize,
            // so the stream is already positioned at the start of the packed data when this method returns.
            int bufferSize = 2 * TarHelpers.RecordSize;
            byte[] bytes = ArrayPool<byte>.Shared.Rent(bufferSize);
 
            try
            {
                int activeStart = 0;
                int availableStart = 0;
 
                // Compact the buffer and read exactly one RecordSize (512) block.
                // Returns true if bytes were read, false on EOF.
                async ValueTask<bool> FillBufferAsync()
                {
                    int active = availableStart - activeStart;
                    if (active > 0 && activeStart > 0)
                    {
                        bytes.AsSpan(activeStart, active).CopyTo(bytes);
                    }
                    activeStart = 0;
                    availableStart = active;
 
                    int newBytes = isAsync
                        ? await rawStream.ReadAtLeastAsync(bytes.AsMemory(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false, cancellationToken).ConfigureAwait(false)
                        : rawStream.ReadAtLeast(bytes.AsSpan(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false);
 
                    availableStart += newBytes;
                    return newBytes > 0;
                }
 
                // Reads a newline-terminated decimal line from the buffer, refilling as needed.
                // Returns the parsed value. Throws InvalidDataException if the line is malformed.
                async ValueTask<long> ReadLineAsync()
                {
                    while (true)
                    {
                        int nlIdx = bytes.AsSpan(activeStart, availableStart - activeStart).IndexOf((byte)'\n');
                        if (nlIdx >= 0)
                        {
                            ReadOnlySpan<byte> span = bytes.AsSpan(activeStart, nlIdx);
                            if (!Utf8Parser.TryParse(span, out long value, out int consumed) || consumed != span.Length)
                            {
                                throw new InvalidDataException(SR.TarInvalidNumber);
                            }
                            activeStart += nlIdx + 1;
                            return value;
                        }
 
                        if (availableStart + TarHelpers.RecordSize > bufferSize)
                        {
                            // Not enough room in the buffer for another block-sized fill
                            // and no newline found: line is too long (malformed).
                            throw new InvalidDataException(SR.TarInvalidNumber);
                        }
 
                        if (!await FillBufferAsync().ConfigureAwait(false))
                        {
                            // EOF before newline.
                            throw new InvalidDataException(SR.TarInvalidNumber);
                        }
                    }
                }
 
                await FillBufferAsync().ConfigureAwait(false);
 
                long numSegments = await ReadLineAsync().ConfigureAwait(false);
                if ((ulong)numSegments > MaxSparseSegments)
                {
                    throw new InvalidDataException(SR.TarInvalidNumber);
                }
 
                var segments = new (long Offset, long Length)[(int)numSegments];
                for (int i = 0; i < (int)numSegments; i++)
                {
                    long offset = await ReadLineAsync().ConfigureAwait(false);
                    long length = await ReadLineAsync().ConfigureAwait(false);
                    if (offset < 0 || length < 0)
                    {
                        throw new InvalidDataException(SR.TarInvalidNumber);
                    }
                    segments[i] = (offset, length);
                }
 
                // Since each FillBuffer call reads exactly RecordSize (512) bytes, the total bytes
                // read is always a multiple of RecordSize (mapBytesConsumed + padding), so the stream
                // is already positioned at the start of the packed data.
                return segments;
            }
            finally
            {
                ArrayPool<byte>.Shared.Return(bytes);
            }
        }
 
        protected override void Dispose(bool disposing)
        {
            if (disposing && !_isDisposed)
            {
                _rawStream.Dispose();
            }
            _isDisposed = true;
            base.Dispose(disposing);
        }
 
        public override async ValueTask DisposeAsync()
        {
            if (!_isDisposed)
            {
                await _rawStream.DisposeAsync().ConfigureAwait(false);
            }
            _isDisposed = true;
            await base.DisposeAsync().ConfigureAwait(false);
        }
 
        public override void Flush() { }
 
        public override Task FlushAsync(CancellationToken cancellationToken) =>
            cancellationToken.IsCancellationRequested ? Task.FromCanceled(cancellationToken) : Task.CompletedTask;
 
        public override void SetLength(long value) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
 
        public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
 
        private void ThrowIfDisposed() => ObjectDisposedException.ThrowIf(_isDisposed, this);
    }
}