5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1326return new TiktokenTokenizer( 1354=> new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize); 1374=> new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize); 1405return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize); 1469return new TiktokenTokenizer(vocabStream,
117 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
ReducingChatClientTests.cs (1)
21private static readonly Tokenizer _gpt4oTokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
Microsoft.Extensions.DataIngestion.Tests (19)
Chunkers\ChunkerOptionsTests.cs (1)
12private static readonly Tokenizer _tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
Chunkers\HeaderChunkerTests.cs (8)
39HeaderChunker chunker = new(new(TiktokenTokenizer.CreateForModel("gpt-4"))); 71HeaderChunker chunker = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 13 }); 96HeaderChunker lessThanContext = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 5 }); 99HeaderChunker sameAsContext = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 6 }); 119HeaderChunker chunker = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 30 }); 136HeaderChunker chunker = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 37 }); 146HeaderChunker chunker = new(new(TiktokenTokenizer.CreateForModel("gpt-4")) { MaxTokensPerChunk = 100 }); 175Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
Chunkers\NoOverlapTokenChunkerTests.cs (2)
16var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
Chunkers\OverlapTokenChunkerTests.cs (4)
17var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); 25var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
Chunkers\SectionChunkerTests.cs (2)
16var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
Chunkers\SemanticSimilarityChunkerTests.cs (1)
26Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
IngestionPipelineTests.cs (1)
226private static IngestionChunker<string> CreateChunker() => new HeaderChunker(new(TiktokenTokenizer.CreateForModel("gpt-4")));
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43/// Create <see cref="TiktokenTokenizer"/> from tokenizer model file. 47public static TiktokenTokenizer FromPretrained( 53return TiktokenTokenizer.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
68var tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (7)
Llama\LlamaSample.cs (2)
36var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); 39var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
Llama\SFT_Llama_3_2_1B.cs (3)
81public static ICausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM> LoadModel(string weightFolder, string checkPointName = "model.safetensors.index.json") 91var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); 94var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
MEAI\Llama3_1.cs (2)
37var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder); 40var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1879new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1293private static TiktokenTokenizer CreateForModel( 1348public static TiktokenTokenizer Create( 1368public static TiktokenTokenizer Create( 1389public static async Task<TiktokenTokenizer> CreateAsync( 1421public static async Task<TiktokenTokenizer> CreateAsync( 1447public static TiktokenTokenizer CreateForModel( 1486public static async Task<TiktokenTokenizer> CreateForModelAsync( 1523public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null) 1533public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null) 1567throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32return SplitText(text, TiktokenTokenizer.P50kBaseRegex()); 47return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
Microsoft.ML.Tokenizers.Data.Tests (3)
TokenizerDataTests.cs (3)
30var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName)); 53TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
Microsoft.ML.Tokenizers.Tests (66)
TiktokenTests.cs (66)
32public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens); 33public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2"); 34public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003"); 35public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada"); 36public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001"); 37public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o"); 38public static Tokenizer GPT5 { get; } = TiktokenTokenizer.CreateForModel("gpt-5"); 39public static Tokenizer GPT5_1 { get; } = TiktokenTokenizer.CreateForModel("gpt-5.1"); 40public static Tokenizer GPT5_2 { get; } = TiktokenTokenizer.CreateForModel("gpt-5.2"); 41public static Tokenizer GPT5_3 { get; } = TiktokenTokenizer.CreateForModel("gpt-5.3"); 42public static Tokenizer GPT5_4 { get; } = TiktokenTokenizer.CreateForModel("gpt-5.4"); 43public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4"); 44public static TiktokenTokenizer GptOss { get; } = TiktokenTokenizer.CreateForModel("gpt-oss-20b"); 52Assert.True(GPT4 is TiktokenTokenizer); 53IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens; 57string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; 68Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens); 73tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens); 77tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens); 82tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens); 88tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream); 94tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream); 98tokenizer = TiktokenTokenizer.CreateForModel("gpt-4"); 125TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!; 126TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens); 150TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text); 167private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded) 203TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 234TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text); 246TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 281TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text); 293foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5, GPT5_1, GPT5_2 }) 346TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text); 365TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text); 384TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text); 403TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text); 489Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName); 490Assert.True(tokenizer is TiktokenTokenizer); 503Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName); 504Assert.True(tokenizer is TiktokenTokenizer); 518Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName); 520Assert.True(tokenizer is TiktokenTokenizer); 521Assert.True(tokenizer1 is TiktokenTokenizer); 523TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!; 524TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!; 535Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!)); 536Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_")); 537Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_")); 538Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_")); 539Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_")); 540Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_")); 541Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_harmony_")); 569Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name); 570Assert.True(tokenizer is TiktokenTokenizer); 574int entriesCount = GetEncoder((tokenizer as TiktokenTokenizer)!)!.Count; 851private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken) 852=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>; 854private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken) 855=> typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>; 857private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken) 858=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;