File: IngestionDocumentReader.cs
Web Access
Project: src\src\Libraries\Microsoft.Extensions.DataIngestion.Abstractions\Microsoft.Extensions.DataIngestion.Abstractions.csproj (Microsoft.Extensions.DataIngestion.Abstractions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
 
namespace Microsoft.Extensions.DataIngestion;
 
/// <summary>
/// Reads source content and converts it to an <see cref="IngestionDocument"/>.
/// </summary>
/// <remarks>
/// Derived types implement the stream-based <c>ReadAsync</c> overload; the file-based overloads
/// open the file and delegate to it.
/// </remarks>
public abstract class IngestionDocumentReader
{
    /// <summary>
    /// Reads a file and converts it to an <see cref="IngestionDocument"/>.
    /// </summary>
    /// <remarks>
    /// The file's full path is used as the document identifier and the media type is inferred
    /// from the file extension.
    /// </remarks>
    /// <param name="source">The file to read.</param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>A task representing the asynchronous read operation.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <see langword="null"/>.</exception>
    public Task<IngestionDocument> ReadAsync(FileInfo source, CancellationToken cancellationToken = default)
    {
        string identifier = Throw.IfNull(source).FullName; // entire path is more unique than just part of it.
        return ReadAsync(source, identifier, GetMediaType(source), cancellationToken);
    }

    /// <summary>
    /// Reads a file and converts it to an <see cref="IngestionDocument"/>.
    /// </summary>
    /// <param name="source">The file to read.</param>
    /// <param name="identifier">The unique identifier for the document.</param>
    /// <param name="mediaType">
    /// The media type of the file. When <see langword="null"/> or empty, the media type is inferred
    /// from the extension of <paramref name="source"/>.
    /// </param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>A task representing the asynchronous read operation.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="identifier"/> is <see langword="null"/>.</exception>
    /// <exception cref="ArgumentException"><paramref name="identifier"/> is empty.</exception>
    public virtual async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // bufferSize: 1 turns off FileStream's own buffering; presumably the stream-based reader
        // is expected to do whatever buffering it needs.
        using FileStream stream = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 1, FileOptions.Asynchronous);

        // The stream is disposed only after the awaited read completes, so the await must stay here.
        return await ReadAsync(stream, identifier, string.IsNullOrEmpty(mediaType) ? GetMediaType(source) : mediaType!, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Reads a stream and converts it to an <see cref="IngestionDocument"/>.
    /// </summary>
    /// <param name="source">The stream to read.</param>
    /// <param name="identifier">The unique identifier for the document.</param>
    /// <param name="mediaType">The media type of the content.</param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>A task representing the asynchronous read operation.</returns>
    public abstract Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default);

    /// <summary>
    /// Maps the extension of <paramref name="source"/> to a media type.
    /// </summary>
    /// <param name="source">The file whose extension is inspected.</param>
    /// <returns>
    /// The media type associated with the extension, compared without regard to case,
    /// or <c>application/octet-stream</c> when the extension is not recognized.
    /// </returns>
    private static string GetMediaType(FileInfo source)
    {
        // Extensions are frequently written in upper or mixed case (e.g. "REPORT.PDF"), so the
        // extension is normalized before it is matched against the lower-case keys below.
#pragma warning disable CA1308 // Lower-casing is intentional: the lookup keys are lower-case.
        string extension = source.Extension.ToLowerInvariant();
#pragma warning restore CA1308

        return extension switch
        {
            ".123" => "application/vnd.lotus-1-2-3",
            ".602" => "application/x-t602",
            ".abw" => "application/x-abiword",
            ".bmp" => "image/bmp",
            ".cgm" => "image/cgm",
            ".csv" => "text/csv",
            ".cwk" => "application/x-cwk",
            ".dbf" => "application/vnd.dbf",
            ".dif" => "application/x-dif",
            ".doc" => "application/msword",
            ".docm" => "application/vnd.ms-word.document.macroEnabled.12",
            ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".dot" => "application/msword",
            ".dotm" => "application/vnd.ms-word.template.macroEnabled.12",
            ".dotx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
            ".epub" => "application/epub+zip",
            ".et" => "application/vnd.ms-excel",
            ".eth" => "application/ethos",
            ".fods" => "application/vnd.oasis.opendocument.spreadsheet",
            ".gif" => "image/gif",
            ".htm" => "text/html",
            ".html" => "text/html",
            ".hwp" => "application/x-hwp",
            ".jpeg" => "image/jpeg",
            ".jpg" => "image/jpeg",
            ".key" => "application/x-iwork-keynote-sffkey",
            ".lwp" => "application/vnd.lotus-wordpro",
            ".mcw" => "application/macwriteii",
            ".mw" => "application/macwriteii",
            ".numbers" => "application/x-iwork-numbers-sffnumbers",
            ".ods" => "application/vnd.oasis.opendocument.spreadsheet",
            ".pages" => "application/x-iwork-pages-sffpages",
            ".pbd" => "application/x-pagemaker",
            ".pdf" => "application/pdf",
            ".png" => "image/png",
            ".pot" => "application/vnd.ms-powerpoint",
            ".potm" => "application/vnd.ms-powerpoint.template.macroEnabled.12",
            ".potx" => "application/vnd.openxmlformats-officedocument.presentationml.template",
            ".ppt" => "application/vnd.ms-powerpoint",
            ".pptm" => "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
            ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".prn" => "application/x-prn",
            ".qpw" => "application/x-quattro-pro",
            ".rtf" => "application/rtf",
            ".sda" => "application/vnd.stardivision.draw",
            ".sdd" => "application/vnd.stardivision.impress",
            ".sdp" => "application/sdp",
            ".sdw" => "application/vnd.stardivision.writer",
            ".sgl" => "application/vnd.stardivision.writer",
            ".slk" => "text/vnd.sylk",
            ".sti" => "application/vnd.sun.xml.impress.template",
            ".stw" => "application/vnd.sun.xml.writer.template",
            ".svg" => "image/svg+xml",
            ".sxg" => "application/vnd.sun.xml.writer.global",
            ".sxi" => "application/vnd.sun.xml.impress",
            ".sxw" => "application/vnd.sun.xml.writer",
            ".sylk" => "text/vnd.sylk",
            ".tiff" => "image/tiff",
            ".tsv" => "text/tab-separated-values",
            ".txt" => "text/plain",
            ".uof" => "application/vnd.uoml+xml",
            ".uop" => "application/vnd.openofficeorg.presentation",
            ".uos1" => "application/vnd.uoml+xml",
            ".uos2" => "application/vnd.uoml+xml",
            ".uot" => "application/x-uo",
            ".vor" => "application/vnd.stardivision.writer",
            ".webp" => "image/webp",
            ".wpd" => "application/wordperfect",
            ".wps" => "application/vnd.ms-works",
            ".wq1" => "application/x-lotus",
            ".wq2" => "application/x-lotus",
            ".xls" => "application/vnd.ms-excel",
            ".xlsb" => "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
            ".xlsm" => "application/vnd.ms-excel.sheet.macroEnabled.12",
            ".xlr" => "application/vnd.ms-works",
            ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlw" => "application/vnd.ms-excel",
            ".xml" => "application/xml",
            ".zabw" => "application/x-abiword",
            _ => "application/octet-stream"
        };
    }
}