File: SimpleTokenizerTests.cs
Web Access
Project: src\test\Libraries\Microsoft.Extensions.AI.Evaluation.NLP.Tests\Microsoft.Extensions.AI.Evaluation.NLP.Tests.csproj (Microsoft.Extensions.AI.Evaluation.NLP.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
 
using Microsoft.Extensions.AI.Evaluation.NLP.Common;
using Xunit;
 
namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests;
 
#pragma warning disable AIEVAL001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
 
public class SimpleTokenizerTests
{
    [Theory]
    [InlineData(" $41.23 ", new[] { "$", "41.23" })]
    [InlineData("word", new[] { "WORD" })]
    [InlineData("word1 word2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1,word2", new[] { "WORD1", ",", "WORD2" })]
    [InlineData("word1.word2", new[] { "WORD1", ".", "WORD2" })]
    [InlineData("word1!word2?", new[] { "WORD1", "!", "WORD2", "?" })]
    [InlineData("word1-word2", new[] { "WORD1", "-", "WORD2" })]
    [InlineData("word1 - word2", new[] { "WORD1", "-", "WORD2" })]
    [InlineData("word1-\n word2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1-", new[] { "WORD1", "-" })]
    [InlineData("word1&", new[] { "WORD1", "&" })]
    [InlineData("word1-\r\n word2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1-\r\nword2", new[] { "WORD1WORD2" })]
    [InlineData("word1-\nword2", new[] { "WORD1WORD2" })]
    [InlineData("word1\nword2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1 \n word2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1\r\nword2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1 \r\n word2", new[] { "WORD1", "WORD2" })]
    [InlineData("word1\tword2", new[] { "WORD1", "WORD2" })]
    [InlineData("It is a guide to action that ensures that the military will forever heed Party commands.",
        new[] { "IT", "IS", "A", "GUIDE", "TO", "ACTION", "THAT", "ENSURES", "THAT", "THE", "MILITARY", "WILL", "FOREVER", "HEED", "PARTY", "COMMANDS", "." })]
    [InlineData("Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.",
        new[] { "GOOD", "MUFFINS", "COST", "$", "3.88", "(", "ROUGHLY", "3,36", "EUROS", ")", "IN", "NEW", "YORK", ".", "PLEASE", "BUY", "ME", "TWO", "OF", "THEM", ".", "THANKS", "." })]
    [InlineData("", new string[0])]
    [InlineData(" This is a test.", new[] { "THIS", "IS", "A", "TEST", "." })]
    [InlineData("Hello, world! How's it going?", new[] { "HELLO", ",", "WORLD", "!", "HOW", "'", "S", "IT", "GOING", "?" })]
    [InlineData("&quot;Quotes&quot; and &amp; symbols &lt; &gt; &apos;", new[] { "\"", "QUOTES", "\"", "AND", "&", "SYMBOLS", "<", ">", "'" })]
    [InlineData("-\nThis is a test.", new[] { "THIS", "IS", "A", "TEST", "." })]
    public void Tokenize_Cases(string input, string[] expected)
    {
        var result = SimpleWordTokenizer.WordTokenize(input);
        Assert.Equal(expected, result);
    }
 
    [Fact]
    public void HandlesMultipleSpacesAndEmptyEntries()
    {
        var input = "   word1   word2    word3   ";
        var expected = new[] { "WORD1", "WORD2", "WORD3" };
        var result = SimpleWordTokenizer.WordTokenize(input);
        Assert.Equal(expected, result);
    }
 
    [Fact]
    public void HandlesUnicodeSymbolsAndPunctuation()
    {
        var input = "word1 © word2 ™ word3 — word4";
        var expected = new[] { "WORD1", "©", "WORD2", "™", "WORD3", "—", "WORD4" };
        var result = SimpleWordTokenizer.WordTokenize(input);
        Assert.Equal(expected, result);
    }
 
    [Fact]
    public void HandlesHtmlEntities()
    {
        var input = "&quot;Hello&quot; &amp; Goodbye &lt;test&gt; &apos;";
        var expected = new[] { "\"", "HELLO", "\"", "&", "GOODBYE", "<", "TEST", ">", "'" };
        var result = SimpleWordTokenizer.WordTokenize(input);
        Assert.Equal(expected, result);
    }
}