File: ColumnInference\TextFileSample.cs
Web Access
Project: src\src\Microsoft.ML.AutoML\Microsoft.ML.AutoML.csproj (Microsoft.ML.AutoML)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.Data;
 
namespace Microsoft.ML.AutoML
{
    /// <summary>
    /// This class holds an in-memory sample of the text file, and serves as an <see cref="IMultiStreamSource"/> proxy to it.
    /// </summary>
    internal sealed class TextFileSample : IMultiStreamSource
    {
        // REVIEW: consider including multiple files via IMultiStreamSource.
 
        // REVIEW: right now, it expects 0x0A being the trailing character of line break.
        // Consider a more general implementation.
 
        private const int BufferSizeMb = 4;
        private const int FirstChunkSizeMb = 1;
        private const int LinesPerChunk = 20;
        private const Double OversamplingRate = 1.1;
 
        private readonly byte[] _buffer;
        private readonly long? _fullFileSize;
        private readonly long? _approximateRowCount;
 
        private TextFileSample(byte[] buffer, long? fullFileSize, long? lineCount)
        {
            _buffer = buffer;
            _fullFileSize = fullFileSize;
            _approximateRowCount = lineCount;
        }
 
        public int Count
        {
            get { return 1; }
        }
 
        // Full file size, if known, otherwise, null.
        public long? FullFileSize
        {
            get { return _fullFileSize; }
        }
 
        public int SampleSize
        {
            get { return _buffer.Length; }
        }
 
        public string GetPathOrNull(int index)
        {
            //Contracts.Check(index == 0, "Index must be 0");
            return null;
        }
 
        public Stream Open(int index)
        {
            //Contracts.Check(index == 0, "Index must be 0");
            return new MemoryStream(_buffer);
        }
 
        public TextReader OpenTextReader(int index)
        {
            return new StreamReader(Open(index));
        }
 
        public long? ApproximateRowCount => _approximateRowCount;
 
        public static TextFileSample CreateFromFullFile(string path)
        {
            using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return CreateFromFullStream(fs);
            }
        }
 
        /// <summary>
        /// Create a <see cref="TextFileSample"/> by reading multiple chunks from the file (or other source) and
        /// then stitching them together. The algorithm is as follows:
        /// 0. If the source is not seekable, revert to <see cref="CreateFromHead"/>.
        /// 1. If the file length is less than 2 * <see cref="BufferSizeMb"/>, revert to <see cref="CreateFromHead"/>.
        /// 2. Read first <see cref="FirstChunkSizeMb"/> MB chunk. Determine average line length in the chunk.
        /// 3. Determine how large one chunk should be, and how many chunks there should be, to end up
        /// with <see cref="BufferSizeMb"/> * <see cref="OversamplingRate"/> MB worth of lines.
        /// 4. Determine seek locations and read the chunks.
        /// 5. Stitch and return a <see cref="TextFileSample"/>.
        /// </summary>
        public static TextFileSample CreateFromFullStream(Stream stream)
        {
            if (!stream.CanSeek)
            {
                return CreateFromHead(stream);
            }
            var fileSize = stream.Length;
 
            if (fileSize <= 2 * BufferSizeMb * (1 << 20))
            {
                return CreateFromHead(stream);
            }
 
            var firstChunk = new byte[FirstChunkSizeMb * (1 << 20)];
            int count = stream.Read(firstChunk, 0, firstChunk.Length);
            if (!IsEncodingOkForSampling(firstChunk))
                return CreateFromHead(stream);
            // REVIEW: CreateFromHead still truncates the file before the last 0x0A byte. For multi-byte encoding,
            // this might cause an unfinished string to be present in the buffer. Right now this is considered an acceptable
            // price to pay for parse-free processing.
 
            var lineCount = firstChunk.Count(x => x == '\n');
            if (lineCount == 0)
            {
                throw new ArgumentException("Couldn't identify line breaks. Provided file is not text?");
            }
 
            long approximateRowCount = (long)(lineCount * fileSize * 1.0 / firstChunk.Length);
            var firstNewline = Array.FindIndex(firstChunk, x => x == '\n');
 
            // First line may be header, so we exclude it. The remaining lineCount-1 line breaks are
            // splitting the text into lineCount lines, and the last line is actually half-size.
            Double averageLineLength = 2.0 * (firstChunk.Length - firstNewline) / (lineCount * 2 - 1);
            averageLineLength = Math.Max(averageLineLength, 3);
 
            int usefulChunkSize = (int)(averageLineLength * LinesPerChunk);
            int chunkSize = (int)(usefulChunkSize + averageLineLength); // assuming that 1 line worth will be trimmed out
 
            int chunkCount = (int)Math.Ceiling((BufferSizeMb * OversamplingRate - FirstChunkSizeMb) * (1 << 20) / usefulChunkSize);
            int maxChunkCount = (int)Math.Floor((double)(fileSize - firstChunk.Length) / chunkSize);
            chunkCount = Math.Min(chunkCount, maxChunkCount);
 
            var chunks = new List<byte[]>();
            chunks.Add(firstChunk);
 
            // determine the start of each remaining chunk
            long fileSizeRemaining = fileSize - firstChunk.Length - ((long)chunkSize) * chunkCount;
 
            var chunkStartIndices = Enumerable.Range(0, chunkCount)
                .Select(x => AutoMlUtils.Random.Value.NextDouble() * fileSizeRemaining)
                .OrderBy(x => x)
                .Select((spot, i) => (long)(spot + firstChunk.Length + i * chunkSize))
                .ToArray();
 
            foreach (var chunkStartIndex in chunkStartIndices)
            {
                stream.Seek(chunkStartIndex, SeekOrigin.Begin);
                byte[] chunk = new byte[chunkSize];
                int readCount = stream.Read(chunk, 0, chunkSize);
                Array.Resize(ref chunk, chunkSize);
                chunks.Add(chunk);
            }
 
            return new TextFileSample(StitchChunks(false, chunks.ToArray()), fileSize, approximateRowCount);
        }
 
        /// <summary>
        /// Create a <see cref="TextFileSample"/> by reading one chunk from the beginning.
        /// </summary>
        private static TextFileSample CreateFromHead(Stream stream)
        {
            var buf = new byte[BufferSizeMb * (1 << 20)];
            int readCount = stream.Read(buf, 0, buf.Length);
            Array.Resize(ref buf, readCount);
            long? multiplier = stream.CanSeek ? (int?)(stream.Length / buf.Length) : null;
            return new TextFileSample(StitchChunks(readCount == stream.Length, buf),
                stream.CanSeek ? (long?)stream.Length : null,
                multiplier.HasValue ? buf.Count(x => x == '\n') * multiplier : null);
        }
 
        /// <summary>
        /// Given an array of chunks of the text file, of which the first chunk is the head,
        /// this method trims incomplete lines from the beginning and end of each chunk
        /// (except that it doesn't trim the beginning of the first chunk and end of last chunk if we read whole file),
        /// then joins the rest together to form a final byte buffer and returns a <see cref="TextFileSample"/>
        /// wrapped around it.
        /// </summary>
        /// <param name="wholeFile">did we read whole file</param>
        /// <param name="chunks">chunks of data</param>
        /// <returns></returns>
        private static byte[] StitchChunks(bool wholeFile, params byte[][] chunks)
        {
            using (var resultStream = new MemoryStream(BufferSizeMb * (1 << 20)))
            {
                for (int i = 0; i < chunks.Length; i++)
                {
                    int iMin = (i == 0) ? 0 : Array.FindIndex(chunks[i], x => x == '\n') + 1;
                    int iLim = (wholeFile && i == chunks.Length - 1)
                        ? chunks[i].Length
                        : Array.FindLastIndex(chunks[i], x => x == '\n') + 1;
 
                    if (iLim == 0)
                    {
                        //entire buffer is one string, skip
                        continue;
                    }
 
                    resultStream.Write(chunks[i], iMin, iLim - iMin);
                }
 
                var resultBuffer = resultStream.ToArray();
                if (resultBuffer.Length == 0)
                {
                    throw new ArgumentException("File is not text, or couldn't detect line breaks");
                }
 
                return resultBuffer;
            }
        }
 
        /// <summary>
        /// Detect whether we can auto-detect EOL characters without parsing.
        /// If we do, we can cheaply sample from different file locations and trim the partial strings.
        /// The encodings that pass the test are UTF8 and all single-byte encodings.
        /// </summary>
        private static bool IsEncodingOkForSampling(byte[] buffer)
        {
            // First check if a BOM/signature exists (sourced from https://www.unicode.org/faq/utf_bom.html#bom4)
            if (buffer.Length >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF)
            {
                // UTF-32, big-endian
                return false;
            }
            if (buffer.Length >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00)
            {
                // UTF-32, little-endian
                return false;
            }
            if (buffer.Length >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
            {
                // UTF-16, big-endian
                return false;
            }
            if (buffer.Length >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
            {
                // UTF-16, little-endian
                return false;
            }
            if (buffer.Length >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
            {
                // UTF-8
                return true;
            }
            if (buffer.Length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                // UTF-7
                return true;
            }
 
            // No BOM/signature was found, so now we need to 'sniff' the file to see if can manually discover the encoding.
            int sniffLim = Math.Min(1000, buffer.Length);
 
            // Some text files are encoded in UTF8, but have no BOM/signature. Hence the below manually checks for a UTF8 pattern. This code is based off
            // the top answer at: https://stackoverflow.com/questions/6555015/check-for-invalid-utf8 .
            int i = 0;
            bool utf8 = false;
            while (i < sniffLim - 4)
            {
                if (buffer[i] <= 0x7F)
                {
                    i += 1;
                    continue;
                }
                if (buffer[i] >= 0xC2 && buffer[i] <= 0xDF && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0)
                {
                    i += 2;
                    utf8 = true;
                    continue;
                }
                if (buffer[i] >= 0xE0 && buffer[i] <= 0xF0 && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0 &&
                    buffer[i + 2] >= 0x80 && buffer[i + 2] < 0xC0)
                {
                    i += 3;
                    utf8 = true;
                    continue;
                }
                if (buffer[i] >= 0xF0 && buffer[i] <= 0xF4 && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0 &&
                    buffer[i + 2] >= 0x80 && buffer[i + 2] < 0xC0 && buffer[i + 3] >= 0x80 && buffer[i + 3] < 0xC0)
                {
                    i += 4;
                    utf8 = true;
                    continue;
                }
                utf8 = false;
                break;
            }
            if (utf8)
            {
                return true;
            }
 
            if (buffer.Take(sniffLim).Any(x => x == 0))
            {
                // likely a UTF-16 or UTF-32 without a BOM.
                return false;
            }
 
            // If all else failed, the file is likely in a local 1-byte encoding.
            return true;
        }
    }
}