// File: TokenizerDataTests.cs
// Web Access
// Project: src\test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj (Microsoft.ML.Tokenizers.Data.Tests)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Buffers;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using Xunit;
 
namespace Microsoft.ML.Tokenizers.Tests
{
    /// <summary>
    /// Tests covering how <see cref="TiktokenTokenizer"/> is created: by model name
    /// (which is expected to fail here with a message naming a data package) and from
    /// an explicitly supplied vocabulary file.
    /// </summary>
    public class TokenizerDataTests
    {
        /// <summary>
        /// Verifies that <c>TiktokenTokenizer.CreateForModel</c> throws for the given model
        /// and that the exception message mentions the expected data package name.
        /// </summary>
        /// <param name="modelName">Model name passed to <c>CreateForModel</c>.</param>
        /// <param name="packageName">Package name that must appear in the exception message.</param>
        /// <remarks>
        /// NOTE(review): the test name implies the data packages are intentionally not
        /// referenced by this test project; confirm against the project file.
        /// </remarks>
        [Theory]
        [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")]            // O200kBase
        [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")]            // Cl100kBase
        [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")]   // P50kBase
        [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")]   // R50kBase
        [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")]                   // Gpt2
        public void TestMissingDataPackages(string modelName, string packageName)
        {
            // The concrete exception type is not asserted; only that something was thrown
            // and that its message points at the expected package.
            var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));
            Assert.NotNull(exception);
            Assert.Contains(packageName, exception.Message);
        }

        /// <summary>
        /// Supplies the URLs of the tiktoken vocabulary files used by
        /// <see cref="TestTokenizerCreationWithProvidedData(string)"/>.
        /// </summary>
        /// <returns>One single-element argument array per vocabulary file URL.</returns>
        public static IEnumerable<object[]> ModelUrlData()
        {
            // Gpt2 is covered by the r50k_base.tiktoken file
            yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
            yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
            yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
            yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
        }

        /// <summary>
        /// Downloads a vocabulary file from <paramref name="url"/> into a temporary file and
        /// verifies that <c>TiktokenTokenizer.Create</c> returns a non-null tokenizer for it.
        /// </summary>
        /// <param name="url">URL of the tiktoken vocabulary file to download.</param>
        /// <returns>A task that completes when the test has finished.</returns>
        [Theory]
        [MemberData(nameof(ModelUrlData))]
        public async Task TestTokenizerCreationWithProvidedData(string url)
        {
            string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");

            try
            {
                // The download lives inside the try block so that a failed download still
                // reaches the finally clause and the temporary file is cleaned up.
                await Utils.DownloadFile(url, tokenizerDataFileName);

                TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
                Assert.NotNull(externalTokenizer);
            }
            finally
            {
                Utils.DeleteFile(tokenizerDataFileName);
            }
        }
    }
}