5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1322
return new
TiktokenTokenizer
(
1350
=> new
TiktokenTokenizer
(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1370
=> new
TiktokenTokenizer
(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1401
return new
TiktokenTokenizer
(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1465
return new
TiktokenTokenizer
(vocabStream,
115 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
ReducingChatClientTests.cs (1)
21
private static readonly Tokenizer _gpt4oTokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Microsoft.Extensions.DataIngestion.Tests (19)
Chunkers\ChunkerOptionsTests.cs (1)
12
private static readonly Tokenizer _tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\HeaderChunkerTests.cs (8)
39
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
71
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 13 });
96
HeaderChunker lessThanContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 5 });
99
HeaderChunker sameAsContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 6 });
119
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 30 });
136
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 37 });
146
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 100 });
175
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\NoOverlapTokenChunkerTests.cs (2)
16
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\OverlapTokenChunkerTests.cs (4)
17
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
25
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\SectionChunkerTests.cs (2)
16
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\SemanticSimilarityChunkerTests.cs (1)
26
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
IngestionPipelineTests.cs (1)
226
private static IngestionChunker<string> CreateChunker() => new HeaderChunker(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43
/// Create <see cref="
TiktokenTokenizer
"/> from tokenizer model file.
47
public static
TiktokenTokenizer
FromPretrained(
53
return
TiktokenTokenizer
.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
68
var
tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (7)
Llama\LlamaSample.cs (2)
36
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
39
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Llama\SFT_Llama_3_2_1B.cs (3)
81
public static ICausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM> LoadModel(string weightFolder, string checkPointName = "model.safetensors.index.json")
91
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
94
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
MEAI\Llama3_1.cs (2)
37
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1879
new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117
vocabStream, mergesStream, new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1289
private static
TiktokenTokenizer
CreateForModel(
1344
public static
TiktokenTokenizer
Create(
1364
public static
TiktokenTokenizer
Create(
1385
public static async Task<
TiktokenTokenizer
> CreateAsync(
1417
public static async Task<
TiktokenTokenizer
> CreateAsync(
1443
public static
TiktokenTokenizer
CreateForModel(
1482
public static async Task<
TiktokenTokenizer
> CreateForModelAsync(
1519
public static
TiktokenTokenizer
CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1529
public static
TiktokenTokenizer
CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1563
throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {
TiktokenTokenizer
.Cl100kBaseEncodingName}, {
TiktokenTokenizer
.P50kBaseEncodingName}, {
TiktokenTokenizer
.P50kEditEncodingName}, and {
TiktokenTokenizer
.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
47
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
Microsoft.ML.Tokenizers.Data.Tests (3)
TokenizerDataTests.cs (3)
30
var exception = Record.Exception(() =>
TiktokenTokenizer
.CreateForModel(modelName));
53
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
Microsoft.ML.Tokenizers.Tests (64)
TiktokenTests.cs (64)
32
public static Tokenizer GPT4 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4", _specialTokens);
33
public static Tokenizer GPT2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt2");
34
public static Tokenizer P50kBase { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-003");
35
public static Tokenizer R50kBase { get; } =
TiktokenTokenizer
.CreateForModel("ada");
36
public static Tokenizer P50kEdit { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-edit-001");
37
public static Tokenizer GPT4o { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4o");
38
public static Tokenizer GPT5 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5");
39
public static Tokenizer GPT5_1 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5.1");
40
public static Tokenizer GPT5_2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5.2");
41
public static Tokenizer Phi4 { get; } =
TiktokenTokenizer
.CreateForModel("phi-4");
42
public static
TiktokenTokenizer
GptOss { get; } =
TiktokenTokenizer
.CreateForModel("gpt-oss-20b");
50
Assert.True(GPT4 is
TiktokenTokenizer
);
51
IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as
TiktokenTokenizer
)!.SpecialTokens;
55
string assemblyName = typeof(
TiktokenTokenizer
).Assembly.FullName!;
66
Tokenizer tokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
71
tokenizer =
TiktokenTokenizer
.Create(stream, GPT4.PreTokenizer, null, specialTokens);
75
tokenizer = await
TiktokenTokenizer
.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
80
tokenizer = await
TiktokenTokenizer
.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
86
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4", stream);
92
tokenizer = await
TiktokenTokenizer
.CreateForModelAsync("gpt-3.5-turbo", stream);
96
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
123
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
124
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
148
TestDecodingWithSpan((tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
165
private void TestDecodingWithSpan(
TiktokenTokenizer
tokenizer, int[] ids, string expectedDecoded)
201
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
232
TestDecodingWithSpan((gpt4Tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
244
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
279
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
291
foreach (
TiktokenTokenizer
tokenizer in new[] { GPT4o, GptOss, GPT5, GPT5_1, GPT5_2 })
344
TestDecodingWithSpan((GPT2 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
363
TestDecodingWithSpan((P50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
382
TestDecodingWithSpan((P50kEdit as
TiktokenTokenizer
)!, encoded.ToArray(), text);
401
TestDecodingWithSpan((R50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
483
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(modelName);
484
Assert.True(tokenizer is
TiktokenTokenizer
);
497
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForEncoding(encodingName);
498
Assert.True(tokenizer is
TiktokenTokenizer
);
512
Tokenizer tokenizer1 =
TiktokenTokenizer
.CreateForModel(modelName);
514
Assert.True(tokenizer is
TiktokenTokenizer
);
515
Assert.True(tokenizer1 is
TiktokenTokenizer
);
517
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
518
TiktokenTokenizer
tiktoken1 = (tokenizer1 as
TiktokenTokenizer
)!;
529
Assert.Throws<ArgumentNullException>(() =>
TiktokenTokenizer
.CreateForEncoding(null!));
530
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("r50k_base_"));
531
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_base_"));
532
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_edit_"));
533
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("cl100k_base_"));
534
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_base_"));
535
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_harmony_"));
561
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(name);
562
Assert.True(tokenizer is
TiktokenTokenizer
);
566
int entriesCount = GetEncoder((tokenizer as
TiktokenTokenizer
)!)!.Count;
843
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(
TiktokenTokenizer
tiktoken)
844
=> typeof(
TiktokenTokenizer
).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
846
private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(
TiktokenTokenizer
tiktoken)
847
=> typeof(
TiktokenTokenizer
).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
849
private static IReadOnlyDictionary<string, int>? GetVocabulary(
TiktokenTokenizer
tiktoken)
850
=> typeof(
TiktokenTokenizer
).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;