LargeText.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Immutable;
using System.IO;
using System.Text;
using System.Threading;
using Microsoft.CodeAnalysis.PooledObjects;
using Roslyn.Utilities;
using System.Diagnostics;
using Microsoft.CodeAnalysis.Collections;
 
namespace Microsoft.CodeAnalysis.Text
{
    /// <summary>
    /// A <see cref="SourceText"/> optimized for very large sources. The text is stored as
    /// a list of chunks (char arrays).
    /// </summary>
    internal sealed class LargeText : SourceText
    {
        /// <remarks>
        /// internal for unit testing
        /// </remarks>
        internal const int ChunkSize = SourceText.LargeObjectHeapLimitInChars; // 40K Unicode chars is 80KB which is less than the large object heap limit.
 
        private readonly ImmutableArray<char[]> _chunks;
        private readonly int[] _chunkStartOffsets;
        private readonly int _length;
        private readonly Encoding? _encodingOpt;
 
        internal LargeText(ImmutableArray<char[]> chunks, Encoding? encodingOpt, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
            : base(checksum, checksumAlgorithm, embeddedTextBlob)
        {
            _chunks = chunks;
            _encodingOpt = encodingOpt;
            _chunkStartOffsets = new int[chunks.Length];
 
            int offset = 0;
            for (int i = 0; i < chunks.Length; i++)
            {
                _chunkStartOffsets[i] = offset;
                offset += chunks[i].Length;
            }
 
            _length = offset;
        }
 
        internal LargeText(ImmutableArray<char[]> chunks, Encoding? encodingOpt, SourceHashAlgorithm checksumAlgorithm)
            : this(chunks, encodingOpt, default(ImmutableArray<byte>), checksumAlgorithm, default(ImmutableArray<byte>))
        {
        }
 
        internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected, bool canBeEmbedded)
        {
            stream.Seek(0, SeekOrigin.Begin);
 
            long longLength = stream.Length;
            if (longLength == 0)
            {
                return SourceText.From(string.Empty, encoding, checksumAlgorithm);
            }
 
            var maxCharRemainingGuess = GetMaxCharCountOrThrowIfHuge(encoding, stream);
            Debug.Assert(longLength > 0 && longLength <= int.MaxValue); // GetMaxCharCountOrThrowIfHuge should have thrown.
            int length = (int)longLength;
 
            using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(length, 4096), leaveOpen: true))
            {
                var chunks = ReadChunksFromTextReader(reader, maxCharRemainingGuess, throwIfBinaryDetected);
 
                // We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
                // We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
                var checksum = CalculateChecksum(stream, checksumAlgorithm);
                var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
                return new LargeText(chunks, reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
            }
        }
 
        internal static SourceText Decode(TextReader reader, int length, Encoding? encodingOpt, SourceHashAlgorithm checksumAlgorithm)
        {
            if (length == 0)
            {
                return SourceText.From(string.Empty, encodingOpt, checksumAlgorithm);
            }
 
            // throwIfBinaryDetected == false since we are given text reader from the beginning
            var chunks = ReadChunksFromTextReader(reader, length, throwIfBinaryDetected: false);
 
            return new LargeText(chunks, encodingOpt, checksumAlgorithm);
        }
 
        private static ImmutableArray<char[]> ReadChunksFromTextReader(TextReader reader, int maxCharRemainingGuess, bool throwIfBinaryDetected)
        {
            var chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);
 
            while (reader.Peek() != -1)
            {
                var nextChunkSize = ChunkSize;
                if (maxCharRemainingGuess < ChunkSize)
                {
                    // maxCharRemainingGuess typically overestimates a little
                    // so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
                    // and then use 64 char tail, which is likely to be resized.
                    nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
                }
 
                char[] chunk = new char[nextChunkSize];
 
                int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
                if (charsRead == 0)
                {
                    break;
                }
 
                maxCharRemainingGuess -= charsRead;
 
                if (charsRead < chunk.Length)
                {
                    Array.Resize(ref chunk, charsRead);
                }
 
                // Check for binary files
                if (throwIfBinaryDetected && IsBinary(chunk))
                {
                    throw new InvalidDataException();
                }
 
                chunks.Add(chunk);
            }
 
            return chunks.ToImmutableAndFree();
        }
 
        private int GetIndexFromPosition(int position)
        {
            // Binary search to find the chunk that contains the given position.
            int idx = _chunkStartOffsets.BinarySearch(position);
            return idx >= 0 ? idx : (~idx - 1);
        }
 
        public override char this[int position]
        {
            get
            {
                int i = GetIndexFromPosition(position);
                return _chunks[i][position - _chunkStartOffsets[i]];
            }
        }
 
        public override Encoding? Encoding => _encodingOpt;
 
        public override int Length => _length;
 
        public override void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
        {
            if (count == 0)
            {
                return;
            }
 
            int chunkIndex = GetIndexFromPosition(sourceIndex);
            int chunkStartOffset = sourceIndex - _chunkStartOffsets[chunkIndex];
            while (true)
            {
                var chunk = _chunks[chunkIndex];
                int charsToCopy = Math.Min(chunk.Length - chunkStartOffset, count);
                Array.Copy(chunk, chunkStartOffset, destination, destinationIndex, charsToCopy);
                count -= charsToCopy;
                if (count <= 0)
                {
                    break;
                }
 
                destinationIndex += charsToCopy;
                chunkStartOffset = 0;
                chunkIndex++;
            }
        }
 
        public override void Write(TextWriter writer, TextSpan span, CancellationToken cancellationToken = default(CancellationToken))
        {
            if (span.Start < 0 || span.Start > _length || span.End > _length)
            {
                throw new ArgumentOutOfRangeException(nameof(span));
            }
 
            int count = span.Length;
            if (count == 0)
            {
                return;
            }
 
            var chunkWriter = writer as LargeTextWriter;
 
            int chunkIndex = GetIndexFromPosition(span.Start);
            int chunkStartOffset = span.Start - _chunkStartOffsets[chunkIndex];
            while (true)
            {
                cancellationToken.ThrowIfCancellationRequested();
                var chunk = _chunks[chunkIndex];
                int charsToWrite = Math.Min(chunk.Length - chunkStartOffset, count);
 
                if (chunkWriter != null && chunkStartOffset == 0 && charsToWrite == chunk.Length)
                {
                    // reuse entire chunk
                    chunkWriter.AppendChunk(chunk);
                }
                else
                {
                    writer.Write(chunk, chunkStartOffset, charsToWrite);
                }
 
                count -= charsToWrite;
                if (count <= 0)
                {
                    break;
                }
 
                chunkStartOffset = 0;
                chunkIndex++;
            }
        }
 
        /// <summary>
        /// Called from <see cref="SourceText.Lines"/> to initialize the <see cref="TextLineCollection"/>. Thereafter,
        /// the collection is cached.
        /// </summary>
        /// <returns>A new <see cref="TextLineCollection"/> representing the individual text lines.</returns>
        protected override TextLineCollection GetLinesCore()
        {
            return new LineInfo(this, ParseLineStarts());
        }
 
        private SegmentedList<int> ParseLineStarts()
        {
            var position = 0;
            var index = 0;
            var lastCr = -1;
            // Initial line capacity estimated at 64 chars / line. This value was obtained by
            // looking at ratios in large files in the roslyn repo.
            var list = new SegmentedList<int>(Length / 64);
 
            // The following loop goes through every character in the text. It is highly
            // performance critical, and thus inlines knowledge about common line breaks
            // and non-line breaks.
            foreach (var chunk in _chunks)
            {
                foreach (var c in chunk)
                {
                    index++;
 
                    // Common case - ASCII & not a line break
                    const uint bias = '\r' + 1;
                    if (unchecked(c - bias) <= (127 - bias))
                    {
                        continue;
                    }
 
                    switch (c)
                    {
                        case '\r':
                            lastCr = index;
                            goto line_break;
 
                        case '\n':
                            // Assumes that the only 2-char line break sequence is CR+LF
                            if (lastCr == (index - 1))
                            {
                                position = index;
                                break;
                            }
 
                            goto line_break;
 
                        case '\u0085':
                        case '\u2028':
                        case '\u2029':
line_break:
                            list.Add(position);
                            position = index;
                            break;
                    }
                }
            }
 
            // Create a start for the final line.  
            list.Add(position);
            return list;
        }
    }
}
File: Text\LargeText.cs	Web Access
Project: src\src\Compilers\Core\Portable\Microsoft.CodeAnalysis.csproj (Microsoft.CodeAnalysis)