File: IngestionDocumentElement.cs
Web Access
Project: src\src\Libraries\Microsoft.Extensions.DataIngestion.Abstractions\Microsoft.Extensions.DataIngestion.Abstractions.csproj (Microsoft.Extensions.DataIngestion.Abstractions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Microsoft.Shared.Diagnostics;
 
namespace Microsoft.Extensions.DataIngestion;
 
#pragma warning disable SA1402 // File may only contain a single type
 
/// <summary>
/// Base class for the elements that make up an <see cref="IngestionDocument"/>.
/// </summary>
/// <remarks>
/// Every element exposes a markdown representation through <see cref="GetMarkdown"/>,
/// optional plain <see cref="Text"/>, an optional <see cref="PageNumber"/>, and a
/// lazily created <see cref="Metadata"/> dictionary.
/// </remarks>
[DebuggerDisplay("Type = {GetType().Name}, Markdown = {GetMarkdown()}")]
public abstract class IngestionDocumentElement
{
    // Markdown returned by the default GetMarkdown() implementation.
    // The parameterless constructor leaves it unset (null).
#pragma warning disable IDE1006 // Naming Styles
    private protected string _markdown;
#pragma warning restore IDE1006 // Naming Styles

    // Backing store for Metadata; stays null until Metadata is first read.
    private Dictionary<string, object?>? _metadata;

    /// <summary>
    /// Creates an element whose markdown representation is <paramref name="markdown"/>.
    /// </summary>
    /// <param name="markdown">The markdown representation of the element.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    private protected IngestionDocumentElement(string markdown)
    {
        // Null and empty input are both reported as ArgumentNullException.
        if (string.IsNullOrEmpty(markdown))
        {
            throw new ArgumentNullException(nameof(markdown));
        }

        _markdown = markdown;
    }

    /// <summary>
    /// Creates an element without a stored markdown representation.
    /// </summary>
    /// <remarks>
    /// The markdown backing field is left <see langword="null"/>, so the base
    /// <see cref="GetMarkdown"/> implementation would return <see langword="null"/>.
    /// In this file the constructor is only used by <see cref="IngestionDocumentSection"/>,
    /// which overrides <see cref="GetMarkdown"/>.
    /// </remarks>
    private protected IngestionDocumentElement()
    {
        _markdown = null!;
    }

    /// <summary>
    /// Gets or sets the textual content of the element.
    /// </summary>
    public string? Text { get; set; }

    /// <summary>
    /// Returns the markdown representation of the element.
    /// </summary>
    /// <returns>The markdown stored for this element, unless a derived type overrides this method.</returns>
    public virtual string GetMarkdown()
    {
        return _markdown;
    }

    /// <summary>
    /// Gets or sets the page number where this element appears.
    /// </summary>
    public int? PageNumber { get; set; }

    /// <summary>
    /// Gets a value indicating whether the metadata dictionary has been created and
    /// currently holds at least one entry.
    /// </summary>
    /// <remarks>
    /// Reading this property does not create the dictionary.
    /// </remarks>
    public bool HasMetadata => _metadata is { Count: > 0 };

    /// <summary>
    /// Gets the metadata associated with this element. The dictionary is created on first access.
    /// </summary>
    public IDictionary<string, object?> Metadata
    {
        get
        {
            _metadata ??= new Dictionary<string, object?>();
            return _metadata;
        }
    }
}
 
/// <summary>
/// A container element: either a single page or a logical grouping of elements in a document.
/// </summary>
/// <remarks>
/// The markdown of a section is always computed from <see cref="Elements"/>; see <see cref="GetMarkdown"/>.
/// </remarks>
public sealed class IngestionDocumentSection : IngestionDocumentElement
{
    /// <summary>
    /// Creates a section from a markdown representation.
    /// </summary>
    /// <param name="markdown">The markdown representation of the section.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    /// <remarks>
    /// The value is validated and stored by the base class, but <see cref="GetMarkdown"/>
    /// as overridden here does not read it.
    /// </remarks>
    public IngestionDocumentSection(string markdown) : base(markdown)
    {
    }

    /// <summary>
    /// Creates an empty section with no stored markdown.
    /// </summary>
    public IngestionDocumentSection()
    {
    }

    /// <summary>
    /// Gets the mutable list of elements contained in this section.
    /// </summary>
    public IList<IngestionDocumentElement> Elements { get; } = new List<IngestionDocumentElement>();

    /// <summary>
    /// Builds the section markdown by concatenating the markdown of every element in
    /// <see cref="Elements"/>, in list order, separated by <see cref="Environment.NewLine"/>.
    /// </summary>
    /// <returns>The joined markdown; an empty string when the section has no elements.</returns>
    public override string GetMarkdown()
    {
        IEnumerable<string> parts = Elements.Select(static element => element.GetMarkdown());
        return string.Join(Environment.NewLine, parts);
    }
}
 
/// <summary>
/// A paragraph element of a document.
/// </summary>
public sealed class IngestionDocumentParagraph : IngestionDocumentElement
{
    /// <summary>
    /// Creates a paragraph from its markdown representation.
    /// </summary>
    /// <param name="markdown">The markdown representation of the paragraph.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    public IngestionDocumentParagraph(string markdown) : base(markdown)
    {
        // Validation and storage of the markdown are handled by the base constructor.
    }
}
 
/// <summary>
/// A header element of a document.
/// </summary>
public sealed class IngestionDocumentHeader : IngestionDocumentElement
{
    /// <summary>
    /// Creates a header from its markdown representation.
    /// </summary>
    /// <param name="markdown">The markdown representation of the header.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    public IngestionDocumentHeader(string markdown) : base(markdown)
    {
        // Validation and storage of the markdown are handled by the base constructor.
    }

    /// <summary>
    /// Gets or sets the level of the header; <see langword="null"/> when no level has been assigned.
    /// </summary>
    public int? Level { get; set; }
}
 
/// <summary>
/// A footer element of a document.
/// </summary>
public sealed class IngestionDocumentFooter : IngestionDocumentElement
{
    /// <summary>
    /// Creates a footer from its markdown representation.
    /// </summary>
    /// <param name="markdown">The markdown representation of the footer.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    public IngestionDocumentFooter(string markdown) : base(markdown)
    {
        // Validation and storage of the markdown are handled by the base constructor.
    }
}
 
/// <summary>
/// A table element of a document, carrying both its markdown and its individual cells.
/// </summary>
public sealed class IngestionDocumentTable : IngestionDocumentElement
{
    /// <summary>
    /// Creates a table from its markdown representation and its cell grid.
    /// </summary>
    /// <param name="markdown">The markdown representation of the table.</param>
    /// <param name="cells">The cells of the table; a <see langword="null"/> entry denotes an empty cell.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="markdown"/> is <see langword="null"/> or empty, or <paramref name="cells"/> is <see langword="null"/>.
    /// </exception>
    /// <remarks>
    /// <paramref name="markdown"/> is validated first (by the base constructor), then <paramref name="cells"/>.
    /// </remarks>
#pragma warning disable CA1814 // Prefer jagged arrays over multidimensional
#pragma warning disable S3967 // Multidimensional arrays should not be used
    public IngestionDocumentTable(string markdown, IngestionDocumentElement?[,] cells)
        : base(markdown)
        => Cells = Throw.IfNull(cells);

    /// <summary>
    /// Gets the cells of the table as a two-dimensional array of cell contents,
    /// where the first row holds the headers.
    /// </summary>
    /// <remarks>
    /// <para>This information is useful when chunking large tables that exceed the token count limit.</para>
    /// <para>A <see langword="null"/> entry represents an empty cell (<see cref="IngestionDocumentElement.GetMarkdown()"/> can't return an empty string).</para>
    /// </remarks>
#pragma warning disable CA1819 // Properties should not return arrays
    public IngestionDocumentElement?[,] Cells { get; }
#pragma warning restore CA1819 // Properties should not return arrays
#pragma warning restore S3967 // Multidimensional arrays should not be used
#pragma warning restore CA1814 // Prefer jagged arrays over multidimensional
}
 
/// <summary>
/// An image element of a document.
/// </summary>
public sealed class IngestionDocumentImage : IngestionDocumentElement
{
    /// <summary>
    /// Creates an image from its markdown representation.
    /// </summary>
    /// <param name="markdown">The markdown representation of the image.</param>
    /// <exception cref="ArgumentNullException"><paramref name="markdown"/> is <see langword="null"/> or empty.</exception>
    public IngestionDocumentImage(string markdown) : base(markdown)
    {
        // Validation and storage of the markdown are handled by the base constructor.
    }

    /// <summary>
    /// Gets or sets the binary content of the image; <see langword="null"/> when no content has been set.
    /// </summary>
    public ReadOnlyMemory<byte>? Content { get; set; }

    /// <summary>
    /// Gets or sets the media type of the image.
    /// </summary>
    public string? MediaType { get; set; }

    /// <summary>
    /// Gets or sets the alternative text for the image.
    /// </summary>
    /// <remarks>
    /// Alternative text is a brief description of the content, context, or function of an image,
    /// used when the image cannot be displayed or accessed.
    /// This property can be used when generating the embedding for an image that is part of a larger chunk.
    /// </remarks>
    public string? AlternativeText { get; set; }
}
 
#pragma warning restore SA1402 // File may only contain a single type