File: MarkItDownReader.cs
Web Access
Project: src\src\Libraries\Microsoft.Extensions.DataIngestion.MarkItDown\Microsoft.Extensions.DataIngestion.MarkItDown.csproj (Microsoft.Extensions.DataIngestion.MarkItDown)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
 
namespace Microsoft.Extensions.DataIngestion;
 
/// <summary>
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> tool.
/// </summary>
/// <remarks>
/// Every read launches the MarkItDown executable as a child process and parses the Markdown
/// that the process writes to its standard output.
/// </remarks>
public class MarkItDownReader : IngestionDocumentReader
{
    // Default buffer size used by Stream.CopyToAsync when none is specified.
    // Only needed on targets that lack the CopyToAsync(Stream, CancellationToken) overload.
    private const int DefaultCopyBufferSize = 81920;

    private readonly FileInfo? _exePath;
    private readonly bool _extractImages;

    /// <summary>
    /// Initializes a new instance of the <see cref="MarkItDownReader"/> class.
    /// </summary>
    /// <param name="exePath">The path to the MarkItDown executable. When not provided, "markitdown" needs to be added to PATH.</param>
    /// <param name="extractImages">A value indicating whether to extract images.</param>
    public MarkItDownReader(FileInfo? exePath = null, bool extractImages = false)
    {
        _exePath = exePath;
        _extractImages = extractImages;
    }

    /// <inheritdoc/>
    /// <remarks>
    /// The <paramref name="mediaType"/> argument is not used by this overload.
    /// If reading is canceled or fails while the MarkItDown process is still running, the process is terminated.
    /// </remarks>
    /// <exception cref="FileNotFoundException"><paramref name="source"/> does not exist.</exception>
    /// <exception cref="InvalidOperationException">The MarkItDown process exited with a non-zero exit code.</exception>
    public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        if (!source.Exists)
        {
            throw new FileNotFoundException("The specified file does not exist.", source.FullName);
        }

        // Manually set ProcessStartInfo.WorkingDirectory to a "safe location":
        // - If exePath is provided, use its directory.
        // - Otherwise, use AppContext.BaseDirectory (the directory of the running application).
        string workingDirectory = _exePath?.Directory?.FullName ?? AppContext.BaseDirectory;

        ProcessStartInfo startInfo = new()
        {
            FileName = _exePath?.FullName ?? "markitdown",
            WorkingDirectory = workingDirectory,
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardOutput = true,
            StandardOutputEncoding = Encoding.UTF8,
        };

        // Force UTF-8 encoding in the environment (will produce garbage otherwise).
        startInfo.Environment["PYTHONIOENCODING"] = "utf-8";
        startInfo.Environment["LC_ALL"] = "C.UTF-8";
        startInfo.Environment["LANG"] = "C.UTF-8";

#if NET
        // ArgumentList takes care of quoting/escaping each argument.
        startInfo.ArgumentList.Add(source.FullName);
        if (_extractImages)
        {
            startInfo.ArgumentList.Add("--keep-data-uris");
        }
#else
        startInfo.Arguments = $"\"{source.FullName}\"" + (_extractImages ? " --keep-data-uris" : string.Empty);
#endif

        string outputContent;
        using (Process process = new() { StartInfo = startInfo })
        {
            process.Start();

            try
            {
                // Drain standard output before waiting for exit, so the child never blocks on a full pipe.
                outputContent = await process.StandardOutput.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
#if NET
                await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
#else
                process.WaitForExit();
#endif
            }
            catch
            {
                // Disposing a Process only releases the handle; it does not stop the child.
                // Without this, a canceled or failed read would leave MarkItDown running in the background.
                TryKill(process);
                throw;
            }

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"MarkItDown process failed with exit code {process.ExitCode}.");
            }
        }

        return MarkdownParser.Parse(outputContent, identifier);
    }

    /// <inheritdoc/>
    /// <remarks>The contents of <paramref name="source"/> are copied to a temporary file, which is deleted once reading completes or fails.</remarks>
    public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // Instead of creating a temporary file, we could write to the StandardInput of the process.
        // MarkItDown says it supports reading from stdin, but it does not work as expected.
        // Even the sample command line does not work with stdin: "cat example.pdf | markitdown"
        // Until that is understood, the content is written to a temporary file.
        string inputFilePath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());

        // The stream is created outside the try block on purpose: when CreateNew fails because the path
        // already exists, the file belongs to someone else and must not be deleted by the finally block.
        FileStream inputFile = new(inputFilePath, FileMode.CreateNew, FileAccess.Write, FileShare.None, bufferSize: 1, FileOptions.Asynchronous);

        try
        {
            await source
#if NET
                .CopyToAsync(inputFile, cancellationToken)
#else
                // The overload without a buffer size does not accept a token on this target,
                // so pass the default buffer size explicitly to keep the copy cancelable.
                .CopyToAsync(inputFile, DefaultCopyBufferSize, cancellationToken)
#endif
                .ConfigureAwait(false);

            // Release the exclusive handle (FileShare.None) so the MarkItDown process can open the file.
            inputFile.Close();

            return await ReadAsync(new FileInfo(inputFilePath), identifier, mediaType, cancellationToken).ConfigureAwait(false);
        }
        finally
        {
            // Disposing again after Close() is a no-op; this covers the paths where the copy failed.
#if NET
            await inputFile.DisposeAsync().ConfigureAwait(false);
#else
            inputFile.Dispose();
#endif
            File.Delete(inputFilePath);
        }
    }

    /// <summary>
    /// Makes a best-effort attempt to terminate <paramref name="process"/> if it is still running.
    /// </summary>
    /// <param name="process">The started process to terminate.</param>
    private static void TryKill(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
#if NET
                process.Kill(entireProcessTree: true);
#else
                process.Kill();
#endif
            }
        }
        catch (Exception ex) when (ex is InvalidOperationException or System.ComponentModel.Win32Exception or AggregateException)
        {
            // The process exited in the meantime or could not be terminated.
            // Swallow so the original exception (the one that triggered the kill) is the one that propagates.
        }
    }
}