File: Chunkers\IngestionChunkerOptions.cs
Web Access
Project: src\src\Libraries\Microsoft.Extensions.DataIngestion\Microsoft.Extensions.DataIngestion.csproj (Microsoft.Extensions.DataIngestion)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;
 
namespace Microsoft.Extensions.DataIngestion;
 
/// <summary>
/// Configuration settings that control how the ingestion chunker splits content into chunks.
/// </summary>
public class IngestionChunkerOptions
{
    // The defaults below are taken from https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-chunk-documents#text-split-skill-example
    private const int DefaultTokensPerChunk = 2_000;
    private const int DefaultOverlapTokens = 500;

    // Remains null until OverlapTokens is assigned explicitly; the getter then computes a fallback.
    private int? _overlapTokens;

    /// <summary>
    /// Initializes a new instance of the <see cref="IngestionChunkerOptions"/> class.
    /// </summary>
    /// <param name="tokenizer">The tokenizer that will be used to tokenize input.</param>
    public IngestionChunkerOptions(Tokenizer tokenizer)
    {
        Tokenizer = Throw.IfNull(tokenizer);
    }

    /// <summary>
    /// Gets the <see cref="Tokenizer"/> instance supplied at construction time.
    /// </summary>
    public Tokenizer Tokenizer { get; }

    /// <summary>
    /// Gets or sets the upper bound on the number of tokens in a single chunk. Defaults to 2000 when never assigned.
    /// </summary>
    /// <remarks>
    /// The setter validates the value against zero and, when an overlap has been assigned explicitly,
    /// requires the new value to be strictly greater than that overlap.
    /// </remarks>
    public int MaxTokensPerChunk
    {
        // A backing value of zero is treated as "not assigned" and mapped to the default.
        get => field != 0 ? field : DefaultTokensPerChunk;
        set
        {
            _ = Throw.IfLessThanOrEqual(value, 0);

            // Only an explicitly assigned overlap constrains the chunk size; the computed fallback does not.
            if (_overlapTokens is int overlap && overlap >= value)
            {
                Throw.ArgumentOutOfRangeException(nameof(value), "Chunk size must be greater than chunk overlap.");
            }

            field = value;
        }
    }

    /// <summary>
    /// Gets or sets how many tokens are shared between consecutive chunks. Defaults to 500.
    /// </summary>
    /// <remarks>
    /// When no value has been assigned, the getter returns 500 if <see cref="MaxTokensPerChunk"/> is greater
    /// than 500, and 0 otherwise. The setter validates the value against zero and requires it to be strictly
    /// less than <see cref="MaxTokensPerChunk"/>.
    /// </remarks>
    public int OverlapTokens
    {
        // An explicit assignment wins; otherwise use the default overlap only if it fits inside the chunk size.
        get => _overlapTokens ?? (MaxTokensPerChunk > DefaultOverlapTokens ? DefaultOverlapTokens : 0);
        set
        {
            _ = Throw.IfLessThan(value, 0);

            if (value >= MaxTokensPerChunk)
            {
                Throw.ArgumentOutOfRangeException(nameof(value), "Chunk overlap must be less than chunk size.");
            }

            _overlapTokens = value;
        }
    }
}