5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1318return new TiktokenTokenizer(
1346=> new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1366=> new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1397return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1461return new TiktokenTokenizer(vocabStream,
94 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
Microsoft.ML.GenAI.LLaMA (3)
Microsoft.ML.GenAI.LLaMA.Tests (1)
Microsoft.ML.GenAI.Samples (7)
Microsoft.ML.Tokenizers (17)
Model\TiktokenTokenizer.cs (13)
1285private static TiktokenTokenizer CreateForModel(
1340public static TiktokenTokenizer Create(
1360public static TiktokenTokenizer Create(
1381public static async Task<TiktokenTokenizer> CreateAsync(
1413public static async Task<TiktokenTokenizer> CreateAsync(
1439public static TiktokenTokenizer CreateForModel(
1478public static async Task<TiktokenTokenizer> CreateForModelAsync(
1515public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1525public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1559throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
Microsoft.ML.Tokenizers.Data.Tests (3)
Microsoft.ML.Tokenizers.Tests (62)
TiktokenTests.cs (62)
32public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
33public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
34public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
35public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
36public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
37public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
38public static Tokenizer GPT5 { get; } = TiktokenTokenizer.CreateForModel("gpt-5");
39public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");
40public static TiktokenTokenizer GptOss { get; } = TiktokenTokenizer.CreateForModel("gpt-oss-20b");
48Assert.True(GPT4 is TiktokenTokenizer);
49IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
53string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
64Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
69tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
73tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
78tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
84tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
90tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
94tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
121TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
122TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
146TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
163private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
199TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
230TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
242TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
277TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
289foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5 })
342TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
361TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
380TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
399TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
477Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
478Assert.True(tokenizer is TiktokenTokenizer);
491Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
492Assert.True(tokenizer is TiktokenTokenizer);
506Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
508Assert.True(tokenizer is TiktokenTokenizer);
509Assert.True(tokenizer1 is TiktokenTokenizer);
511TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
512TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
523Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
524Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
525Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
526Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
527Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
528Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
529Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_harmony_"));
553Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
554Assert.True(tokenizer is TiktokenTokenizer);
558int entriesCount = GetEncoder((tokenizer as TiktokenTokenizer)!)!.Count;
835private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
836=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
838private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
839=> typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
841private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
842=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;