File: Chunkers\SemanticSimilarityChunkerTests.cs
Web Access
Project: src\test\Libraries\Microsoft.Extensions.DataIngestion.Tests\Microsoft.Extensions.DataIngestion.Tests.csproj (Microsoft.Extensions.DataIngestion.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
using Microsoft.ML.Tokenizers;
using Xunit;
 
namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class SemanticSimilarityChunkerTests : DocumentChunkerTests
    {
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
#pragma warning disable CA2000 // Dispose objects before losing scope
            TestEmbeddingGenerator embeddingClient = new();
#pragma warning restore CA2000 // Dispose objects before losing scope
            return CreateSemanticSimilarityChunker(embeddingClient, maxTokensPerChunk, overlapTokens);
        }
 
        private static IngestionChunker<string> CreateSemanticSimilarityChunker(TestEmbeddingGenerator embeddingClient, int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
            Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            return new SemanticSimilarityChunker(embeddingClient,
                new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = overlapTokens });
        }
 
        [Fact]
        public async Task SingleParagraph()
        {
            string text = ".NET is a free, cross-platform, open-source developer platform for building many " +
                "kinds of applications. It can run programs written in multiple languages, with C# being the most popular. " +
                "It relies on a high-performance runtime that is used in production by many high-scale apps.";
            IngestionDocument doc = new IngestionDocument("doc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });
            using TestEmbeddingGenerator customGenerator = new()
            {
                GenerateAsyncCallback = static async (values, options, ct) =>
                {
                    var embeddings = values.Select(v =>
                        new Embedding<float>(new float[] { 1.0f, 2.0f, 3.0f, 4.0f }))
                        .ToArray();
 
                    return [.. embeddings];
                }
            };
            IngestionChunker<string> chunker = CreateSemanticSimilarityChunker(customGenerator);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Single(chunks);
            Assert.Equal(text, chunks[0].Content);
        }
 
        [Fact]
        public async Task TwoTopicsParagraphs()
        {
            IngestionDocument doc = new IngestionDocument("doc");
            string text1 = ".NET is a free, cross-platform, open-source developer platform for building many" +
                "kinds of applications. It can run programs written in multiple languages, with C# being the most popular.";
            string text2 = "It relies on a high-performance runtime that is used in production by many high-scale apps.";
            string text3 = "Zeus is the chief deity of the Greek pantheon. He is a sky and thunder god in ancient Greek religion and mythology.";
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text1),
                    new IngestionDocumentParagraph(text2),
                    new IngestionDocumentParagraph(text3)
                }
            });
 
            using var customGenerator = new TestEmbeddingGenerator
            {
                GenerateAsyncCallback = async (values, options, ct) =>
                {
                    var embeddings = values.Select((_, index) =>
                    {
                        return index switch
                        {
                            0 => new Embedding<float>(new float[] { 1.0f, 1.0f, 1.0f, 1.0f }),
                            1 => new Embedding<float>(new float[] { 1.0f, 1.0f, 1.0f, 1.0f }),
                            2 => new Embedding<float>(new float[] { -1.0f, -1.0f, -1.0f, -1.0f }),
                            _ => throw new InvalidOperationException("Unexpected call count")
                        };
                    }).ToArray();
 
                    return [.. embeddings];
                }
            };
 
            IngestionChunker<string> chunker = CreateSemanticSimilarityChunker(customGenerator);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(2, chunks.Count);
            Assert.Equal(text1 + Environment.NewLine + text2, chunks[0].Content);
            Assert.Equal(text3, chunks[1].Content);
        }
 
        [Fact]
        public async Task TwoSeparateTopicsWithAllKindsOfElements()
        {
            string dotNetTableMarkdown = """
            | Language | Type | Status |
            | --- | --- | --- |
            | C# | Object-oriented | Primary |
            | F# | Functional | Official |
            | Visual Basic | Object-oriented | Official |
            | PowerShell | Scripting | Supported |
            | IronPython | Dynamic | Community |
            | IronRuby | Dynamic | Community |
            | Boo | Object-oriented | Community |
            | Nemerle | Functional/OOP | Community |
            """;
 
            string godsTableMarkdown = """
            | God | Domain | Symbol | Roman Name |
            | --- | --- | --- | --- |
            | Zeus | Sky & Thunder | Lightning Bolt | Jupiter |
            | Hera | Marriage & Family | Peacock | Juno |
            | Poseidon | Sea & Earthquakes | Trident | Neptune |
            | Athena | Wisdom & War | Owl | Minerva |
            | Apollo | Sun & Music | Lyre | Apollo |
            | Artemis | Hunt & Moon | Silver Bow | Diana |
            | Aphrodite | Love & Beauty | Dove | Venus |
            | Ares | War & Courage | Spear | Mars |
            | Hephaestus | Fire & Forge | Hammer | Vulcan |
            | Demeter | Harvest & Nature | Wheat | Ceres |
            | Dionysus | Wine & Festivity | Grapes | Bacchus |
            | Hermes | Messages & Trade | Caduceus | Mercury |
            """;
 
            IngestionDocument doc = new("dotnet-languages");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentHeader("# .NET Supported Languages") { Level = 1 },
                    new IngestionDocumentParagraph("The .NET platform supports multiple programming languages:"),
                    new IngestionDocumentTable(dotNetTableMarkdown,
                        ToParagraphCells(CreateLanguageTableCells())),
                    new IngestionDocumentParagraph("C# remains the most popular language for .NET development."),
                    new IngestionDocumentHeader("# Ancient Greek Olympian Gods") { Level = 1 },
                    new IngestionDocumentParagraph("The twelve Olympian gods were the principal deities of the Greek pantheon:"),
                    new IngestionDocumentTable(godsTableMarkdown,
                        ToParagraphCells(CreateGreekGodsTableCells())),
                    new IngestionDocumentParagraph("These gods resided on Mount Olympus and ruled over different aspects of mortal and divine life.")
                }
            });
 
            using var customGenerator = new TestEmbeddingGenerator
            {
                GenerateAsyncCallback = async (values, options, ct) =>
                {
                    var embeddings = values.Select((_, index) =>
                    {
                        return index switch
                        {
                            <= 3 => new Embedding<float>(new float[] { 1.0f, 1.0f, 1.0f, 1.0f }),
                            >= 4 and <= 7 => new Embedding<float>(new float[] { -1.0f, -1.0f, -1.0f, -1.0f }),
                            _ => throw new InvalidOperationException($"Unexpected index: {index}")
                        };
                    }).ToArray();
 
                    return [.. embeddings];
                }
            };
 
            IngestionChunker<string> chunker = CreateSemanticSimilarityChunker(customGenerator, 200, 0);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
 
            Assert.Equal(3, chunks.Count);
            Assert.All(chunks, chunk => Assert.Same(doc, chunk.Document));
            Assert.Equal($@"# .NET Supported Languages
The .NET platform supports multiple programming languages:
{dotNetTableMarkdown}
C# remains the most popular language for .NET development.",
            chunks[0].Content, ignoreLineEndingDifferences: true);
            Assert.Equal($@"# Ancient Greek Olympian Gods
The twelve Olympian gods were the principal deities of the Greek pantheon:
| God | Domain | Symbol | Roman Name |
| --- | --- | --- | --- |
| Zeus | Sky & Thunder | Lightning Bolt | Jupiter |
| Hera | Marriage & Family | Peacock | Juno |
| Poseidon | Sea & Earthquakes | Trident | Neptune |
| Athena | Wisdom & War | Owl | Minerva |
| Apollo | Sun & Music | Lyre | Apollo |
| Artemis | Hunt & Moon | Silver Bow | Diana |
| Aphrodite | Love & Beauty | Dove | Venus |
| Ares | War & Courage | Spear | Mars |
| Hephaestus | Fire & Forge | Hammer | Vulcan |
| Demeter | Harvest & Nature | Wheat | Ceres |
| Dionysus | Wine & Festivity | Grapes | Bacchus |",
            chunks[1].Content, ignoreLineEndingDifferences: true);
            Assert.Equal("""
            | God | Domain | Symbol | Roman Name |
            | --- | --- | --- | --- |
            | Hermes | Messages & Trade | Caduceus | Mercury |
            These gods resided on Mount Olympus and ruled over different aspects of mortal and divine life.
            """, chunks[2].Content, ignoreLineEndingDifferences: true);
 
            static string[,] CreateGreekGodsTableCells() => new string[,]
                {
                    { "God", "Domain", "Symbol", "Roman Name" },
                    { "Zeus", "Sky & Thunder", "Lightning Bolt", "Jupiter" },
                    { "Hera", "Marriage & Family", "Peacock", "Juno" },
                    { "Poseidon", "Sea & Earthquakes", "Trident", "Neptune" },
                    { "Athena", "Wisdom & War", "Owl", "Minerva" },
                    { "Apollo", "Sun & Music", "Lyre", "Apollo" },
                    { "Artemis", "Hunt & Moon", "Silver Bow", "Diana" },
                    { "Aphrodite", "Love & Beauty", "Dove", "Venus" },
                    { "Ares", "War & Courage", "Spear", "Mars" },
                    { "Hephaestus", "Fire & Forge", "Hammer", "Vulcan" },
                    { "Demeter", "Harvest & Nature", "Wheat", "Ceres" },
                    { "Dionysus", "Wine & Festivity", "Grapes", "Bacchus" },
                    { "Hermes", "Messages & Trade", "Caduceus", "Mercury" }
                };
 
            static string[,] CreateLanguageTableCells() => new string[,]
                {
                    { "Language", "Type", "Status" },
                    { "C#", "Object-oriented", "Primary" },
                    { "F#", "Functional", "Official" },
                    { "Visual Basic", "Object-oriented", "Official" },
                    { "PowerShell", "Scripting", "Supported" },
                    { "IronPython", "Dynamic", "Community" },
                    { "IronRuby", "Dynamic", "Community" },
                    { "Boo", "Object-oriented", "Community" },
                    { "Nemerle", "Functional/OOP", "Community" }
                };
        }
 
        private static IngestionDocumentParagraph?[,] ToParagraphCells(string[,] cells)
        {
            int rows = cells.GetLength(0);
            int cols = cells.GetLength(1);
            var result = new IngestionDocumentParagraph?[rows, cols];
            for (int i = 0; i < rows; i++)
            {
                for (int j = 0; j < cols; j++)
                {
                    result[i, j] = new IngestionDocumentParagraph(cells[i, j]);
                }
            }
 
            return result;
        }
    }
}