File: Chunkers\SectionChunker.cs
Web Access
Project: src\src\Libraries\Microsoft.Extensions.DataIngestion\Microsoft.Extensions.DataIngestion.csproj (Microsoft.Extensions.DataIngestion)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Threading;
using Microsoft.Shared.Diagnostics;
 
namespace Microsoft.Extensions.DataIngestion.Chunkers;
 
/// <summary>
/// Chunks a document one section at a time: every <see cref="IngestionDocumentSection" /> exposed by
/// <see cref="IngestionDocument.Sections"/> is processed as a separate entity.
/// </summary>
public sealed class SectionChunker : IngestionChunker<string>
{
    private readonly ElementsChunker _elementsChunker;

    /// <summary>
    /// Initializes a new instance of the <see cref="SectionChunker"/> class.
    /// </summary>
    /// <param name="options">The options for the chunker; they are forwarded to the underlying elements chunker.</param>
    public SectionChunker(IngestionChunkerOptions options)
    {
        _elementsChunker = new ElementsChunker(options);
    }

    /// <inheritdoc/>
    public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document, [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(document);

        // A single buffer is reused for every top-level section: it is filled, drained to the caller, then emptied.
        List<IngestionChunk<string>> buffer = [];
        foreach (IngestionDocumentSection topLevelSection in document.Sections)
        {
            // Cancellation is observed once per top-level section, before any work is done for it.
            cancellationToken.ThrowIfCancellationRequested();

            Process(document, topLevelSection, buffer);

            for (int index = 0; index < buffer.Count; index++)
            {
                yield return buffer[index];
            }

            buffer.Clear();
        }
    }

    /// <summary>
    /// Walks the elements of <paramref name="section"/> in order, appending the produced chunks to <paramref name="chunks"/>.
    /// Nested sections are processed recursively and receive the context accumulated so far.
    /// </summary>
    /// <param name="document">The document that owns <paramref name="section"/>.</param>
    /// <param name="section">The section whose elements are chunked.</param>
    /// <param name="chunks">The list that receives the produced chunks.</param>
    /// <param name="parentContext">The context inherited from the enclosing section, or <see langword="null"/> for a top-level section.</param>
    private void Process(IngestionDocument document, IngestionDocumentSection section, List<IngestionChunk<string>> chunks, string? parentContext = null)
    {
        string context = parentContext ?? string.Empty;
        List<IngestionDocumentElement> pending = new(section.Elements.Count);

        for (int index = 0; index < section.Elements.Count; index++)
        {
            IngestionDocumentElement element = section.Elements[index];

            if (element is IngestionDocumentHeader leadingHeader && index == 0)
            {
                // Only a header in the very first position contributes to the context;
                // it is appended to whatever context the parent section supplied.
                string headerMarkdown = leadingHeader.GetMarkdown();
                context = string.IsNullOrEmpty(context)
                    ? headerMarkdown
                    : $"{context} {headerMarkdown}";
            }
            else if (element is IngestionDocumentSection childSection)
            {
                // Emit what has been gathered so far so that chunk order follows element order,
                // then descend into the child section with the current context.
                FlushPending();
                Process(document, childSection, chunks, context);
            }
            else
            {
                pending.Add(element);
            }
        }

        FlushPending();

        // Turns the buffered elements into chunks and resets the buffer; does nothing when the buffer is empty.
        void FlushPending()
        {
            if (pending.Count == 0)
            {
                return;
            }

            chunks.AddRange(_elementsChunker.Process(document, context, pending));
            pending.Clear();
        }
    }
}