File: Chunkers\SectionChunkerTests.cs
Web Access
Project: src\test\Libraries\Microsoft.Extensions.DataIngestion.Tests\Microsoft.Extensions.DataIngestion.Tests.csproj (Microsoft.Extensions.DataIngestion.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;
 
namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class SectionChunkerTests : DocumentChunkerTests
    {
        /// <summary>
        /// Creates the <see cref="SectionChunker"/> exercised by these tests, backed by the
        /// Tiktoken tokenizer for the "gpt-4o" model.
        /// </summary>
        /// <param name="maxTokensPerChunk">Value assigned to the chunker options' <c>MaxTokensPerChunk</c>.</param>
        /// <param name="overlapTokens">Value assigned to the chunker options' <c>OverlapTokens</c>.</param>
        /// <returns>A new <see cref="SectionChunker"/> configured with the supplied values.</returns>
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
            => new SectionChunker(new(TiktokenTokenizer.CreateForModel("gpt-4o"))
            {
                // Same assignment order as before: MaxTokensPerChunk first, then OverlapTokens.
                MaxTokensPerChunk = maxTokensPerChunk,
                OverlapTokens = overlapTokens
            });
 
        /// <summary>
        /// Verifies that a single section holding two paragraphs produces exactly one chunk
        /// whose content is both paragraphs separated by a line break.
        /// </summary>
        [Fact]
        public async Task OneSection()
        {
            IngestionDocument doc = new IngestionDocument("doc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph("This is a paragraph."),
                    new IngestionDocumentParagraph("This is another paragraph.")
                }
            });
            IngestionChunker<string> chunker = CreateDocumentChunker();

            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();

            // Assert.Single returns the sole element, so the count check and the lookup happen in
            // one step (the same pattern SectionWithHeader uses) instead of re-indexing chunks[0].
            IngestionChunk<string> chunk = Assert.Single(chunks);
            string expectedResult = "This is a paragraph.\nThis is another paragraph.";
            Assert.Equal(expectedResult, chunk.Content, ignoreLineEndingDifferences: true);
        }
 
        /// <summary>
        /// Verifies that two sibling sections each produce their own chunk, in document order.
        /// </summary>
        [Fact]
        public async Task TwoSections()
        {
            var firstSection = new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph("This is a paragraph."),
                    new IngestionDocumentParagraph("This is another paragraph.")
                }
            };
            var secondSection = new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph("This is a paragraph in section 2."),
                    new IngestionDocumentParagraph("This is another paragraph in section 2.")
                }
            };

            IngestionDocument document = new IngestionDocument("doc");
            document.Sections.Add(firstSection);
            document.Sections.Add(secondSection);

            IReadOnlyList<IngestionChunk<string>> result = await CreateDocumentChunker().ProcessAsync(document).ToListAsync();

            // One chunk per section, each containing only that section's paragraphs.
            Assert.Equal(2, result.Count);
            Assert.Equal(
                "This is a paragraph.\nThis is another paragraph.",
                result[0].Content,
                ignoreLineEndingDifferences: true);
            Assert.Equal(
                "This is a paragraph in section 2.\nThis is another paragraph in section 2.",
                result[1].Content,
                ignoreLineEndingDifferences: true);
        }
 
        /// <summary>
        /// Verifies that a document whose only section has no elements produces no chunks.
        /// </summary>
        [Fact]
        public async Task EmptySection()
        {
            // The section is added without any elements.
            IngestionDocument document = new("doc")
            {
                Sections = { new IngestionDocumentSection() }
            };

            IReadOnlyList<IngestionChunk<string>> result = await CreateDocumentChunker().ProcessAsync(document).ToListAsync();

            Assert.Empty(result);
        }
 
        /// <summary>
        /// Verifies chunking of a three-level section hierarchy: each level yields its own chunk,
        /// the chunk context is the chain of enclosing headers, and a paragraph that follows a
        /// nested section is emitted as a separate chunk under the parent's context.
        /// </summary>
        [Fact]
        public async Task NestedSections()
        {
            // Build the hierarchy from the innermost section outwards; element order within
            // each section is what the assertions below depend on.
            var subSubsection = new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentHeader("### Subsubsection title"),
                    new IngestionDocumentParagraph("This is a paragraph in subsubsection 1.1.1."),
                    new IngestionDocumentParagraph("This is another paragraph in subsubsection 1.1.1.")
                }
            };

            var subsection = new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentHeader("## Subsection title"),
                    new IngestionDocumentParagraph("This is a paragraph in subsection 1.1."),
                    new IngestionDocumentParagraph("This is another paragraph in subsection 1.1."),
                    subSubsection,
                    new IngestionDocumentParagraph("This is the last paragraph in subsection 1.2."),
                }
            };

            var topSection = new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentHeader("# Section title"),
                    new IngestionDocumentParagraph("This is a paragraph in section 1."),
                    new IngestionDocumentParagraph("This is another paragraph in section 1."),
                    subsection
                }
            };

            IngestionDocument document = new IngestionDocument("doc");
            document.Sections.Add(topSection);

            IReadOnlyList<IngestionChunk<string>> result = await CreateDocumentChunker().ProcessAsync(document).ToListAsync();

            Assert.Equal(4, result.Count);

            // Chunk 0: top-level section.
            IngestionChunk<string> sectionChunk = result[0];
            Assert.Equal("# Section title", sectionChunk.Context);
            Assert.Equal(
                "# Section title\nThis is a paragraph in section 1.\nThis is another paragraph in section 1.",
                sectionChunk.Content,
                ignoreLineEndingDifferences: true);

            // Chunk 1: subsection content that precedes the nested sub-subsection.
            IngestionChunk<string> subsectionChunk = result[1];
            Assert.Equal("# Section title ## Subsection title", subsectionChunk.Context);
            Assert.Equal(
                "# Section title ## Subsection title\nThis is a paragraph in subsection 1.1.\nThis is another paragraph in subsection 1.1.",
                subsectionChunk.Content,
                ignoreLineEndingDifferences: true);

            // Chunk 2: the innermost section, with all three headers as context.
            IngestionChunk<string> subSubsectionChunk = result[2];
            Assert.Equal("# Section title ## Subsection title ### Subsubsection title", subSubsectionChunk.Context);
            Assert.Equal(
                "# Section title ## Subsection title ### Subsubsection title\nThis is a paragraph in subsubsection 1.1.1.\nThis is another paragraph in subsubsection 1.1.1.",
                subSubsectionChunk.Content,
                ignoreLineEndingDifferences: true);

            // Chunk 3: the trailing paragraph after the nested section, back under the subsection context.
            IngestionChunk<string> trailingChunk = result[3];
            Assert.Equal("# Section title ## Subsection title", trailingChunk.Context);
            Assert.Equal(
                "# Section title ## Subsection title\nThis is the last paragraph in subsection 1.2.",
                trailingChunk.Content,
                ignoreLineEndingDifferences: true);
        }
 
        /// <summary>
        /// Verifies that a 600-word paragraph chunked with a 512-token limit is split into two
        /// chunks, that neither chunk exceeds the limit, and that concatenating the chunk
        /// contents reproduces the original text.
        /// </summary>
        [Fact]
        public async Task SizeLimit_TwoChunks()
        {
            // Single source of truth for the limit used both to configure the chunker and to assert on it.
            const int MaxTokensPerChunk = 512;

            string text = string.Join(" ", Enumerable.Repeat("word", 600)); // each word is 1 token
            IngestionDocument doc = new IngestionDocument("twoChunksNoOverlapDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });
            IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: MaxTokensPerChunk);

            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();

            Assert.Equal(2, chunks.Count);

            // Space-separated piece count is used as the token count (see the "1 token" note above).
            // Split always yields at least one piece, so a lower bound of 1 keeps this equivalent to
            // "<= limit" while InRange reports the actual count when the assertion fails.
            Assert.InRange(chunks[0].Content.Split(' ').Length, 1, MaxTokensPerChunk);
            Assert.InRange(chunks[1].Content.Split(' ').Length, 1, MaxTokensPerChunk);

            // Chunks are joined with no separator and must reproduce the original text.
            Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)), ignoreLineEndingDifferences: true);
        }
 
        /// <summary>
        /// Verifies that a section starting with a header yields a single chunk whose content
        /// includes the header text and whose context equals the header text.
        /// </summary>
        [Fact]
        public async Task SectionWithHeader()
        {
            IngestionDocument document = new("doc")
            {
                Sections =
                {
                    new IngestionDocumentSection
                    {
                        Elements =
                        {
                            new IngestionDocumentHeader("Section 1"),
                            new IngestionDocumentParagraph("This is a paragraph in section 1."),
                            new IngestionDocumentParagraph("This is another paragraph in section 1.")
                        }
                    }
                }
            };

            IReadOnlyList<IngestionChunk<string>> result = await CreateDocumentChunker().ProcessAsync(document).ToListAsync();

            // Exactly one chunk is expected; Assert.Single hands it back for the checks below.
            IngestionChunk<string> onlyChunk = Assert.Single(result);
            Assert.Equal(
                "Section 1\nThis is a paragraph in section 1.\nThis is another paragraph in section 1.",
                onlyChunk.Content,
                ignoreLineEndingDifferences: true);
            Assert.Equal("Section 1", onlyChunk.Context);
        }
    }
}