5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1324
return new
TiktokenTokenizer
(
1352
=> new
TiktokenTokenizer
(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1372
=> new
TiktokenTokenizer
(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1403
return new
TiktokenTokenizer
(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1467
return new
TiktokenTokenizer
(vocabStream,
116 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
ReducingChatClientTests.cs (1)
21
private static readonly Tokenizer _gpt4oTokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Microsoft.Extensions.DataIngestion.Tests (19)
Chunkers\ChunkerOptionsTests.cs (1)
12
private static readonly Tokenizer _tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\HeaderChunkerTests.cs (8)
39
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
71
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 13 });
96
HeaderChunker lessThanContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 5 });
99
HeaderChunker sameAsContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 6 });
119
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 30 });
136
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 37 });
146
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 100 });
175
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\NoOverlapTokenChunkerTests.cs (2)
16
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\OverlapTokenChunkerTests.cs (4)
17
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
25
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\SectionChunkerTests.cs (2)
16
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\SemanticSimilarityChunkerTests.cs (1)
26
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
IngestionPipelineTests.cs (1)
226
private static IngestionChunker<string> CreateChunker() => new HeaderChunker(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43
/// Create <see cref="
TiktokenTokenizer
"/> from tokenizer model file.
47
public static
TiktokenTokenizer
FromPretrained(
53
return
TiktokenTokenizer
.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
68
var
tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (7)
Llama\LlamaSample.cs (2)
36
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
39
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Llama\SFT_Llama_3_2_1B.cs (3)
81
public static ICausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM> LoadModel(string weightFolder, string checkPointName = "model.safetensors.index.json")
91
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
94
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
MEAI\Llama3_1.cs (2)
37
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1879
new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117
vocabStream, mergesStream, new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1291
private static
TiktokenTokenizer
CreateForModel(
1346
public static
TiktokenTokenizer
Create(
1366
public static
TiktokenTokenizer
Create(
1387
public static async Task<
TiktokenTokenizer
> CreateAsync(
1419
public static async Task<
TiktokenTokenizer
> CreateAsync(
1445
public static
TiktokenTokenizer
CreateForModel(
1484
public static async Task<
TiktokenTokenizer
> CreateForModelAsync(
1521
public static
TiktokenTokenizer
CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1531
public static
TiktokenTokenizer
CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1565
throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {
TiktokenTokenizer
.Cl100kBaseEncodingName}, {
TiktokenTokenizer
.P50kBaseEncodingName}, {
TiktokenTokenizer
.P50kEditEncodingName}, and {
TiktokenTokenizer
.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
47
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
Microsoft.ML.Tokenizers.Data.Tests (3)
TokenizerDataTests.cs (3)
30
var exception = Record.Exception(() =>
TiktokenTokenizer
.CreateForModel(modelName));
53
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
Microsoft.ML.Tokenizers.Tests (65)
TiktokenTests.cs (65)
32
public static Tokenizer GPT4 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4", _specialTokens);
33
public static Tokenizer GPT2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt2");
34
public static Tokenizer P50kBase { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-003");
35
public static Tokenizer R50kBase { get; } =
TiktokenTokenizer
.CreateForModel("ada");
36
public static Tokenizer P50kEdit { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-edit-001");
37
public static Tokenizer GPT4o { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4o");
38
public static Tokenizer GPT5 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5");
39
public static Tokenizer GPT5_1 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5.1");
40
public static Tokenizer GPT5_2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5.2");
41
public static Tokenizer GPT5_3 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5.3");
42
public static Tokenizer Phi4 { get; } =
TiktokenTokenizer
.CreateForModel("phi-4");
43
public static
TiktokenTokenizer
GptOss { get; } =
TiktokenTokenizer
.CreateForModel("gpt-oss-20b");
51
Assert.True(GPT4 is
TiktokenTokenizer
);
52
IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as
TiktokenTokenizer
)!.SpecialTokens;
56
string assemblyName = typeof(
TiktokenTokenizer
).Assembly.FullName!;
67
Tokenizer tokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
72
tokenizer =
TiktokenTokenizer
.Create(stream, GPT4.PreTokenizer, null, specialTokens);
76
tokenizer = await
TiktokenTokenizer
.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
81
tokenizer = await
TiktokenTokenizer
.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
87
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4", stream);
93
tokenizer = await
TiktokenTokenizer
.CreateForModelAsync("gpt-3.5-turbo", stream);
97
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
124
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
125
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
149
TestDecodingWithSpan((tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
166
private void TestDecodingWithSpan(
TiktokenTokenizer
tokenizer, int[] ids, string expectedDecoded)
202
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
233
TestDecodingWithSpan((gpt4Tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
245
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
280
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
292
foreach (
TiktokenTokenizer
tokenizer in new[] { GPT4o, GptOss, GPT5, GPT5_1, GPT5_2 })
345
TestDecodingWithSpan((GPT2 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
364
TestDecodingWithSpan((P50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
383
TestDecodingWithSpan((P50kEdit as
TiktokenTokenizer
)!, encoded.ToArray(), text);
402
TestDecodingWithSpan((R50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
486
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(modelName);
487
Assert.True(tokenizer is
TiktokenTokenizer
);
500
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForEncoding(encodingName);
501
Assert.True(tokenizer is
TiktokenTokenizer
);
515
Tokenizer tokenizer1 =
TiktokenTokenizer
.CreateForModel(modelName);
517
Assert.True(tokenizer is
TiktokenTokenizer
);
518
Assert.True(tokenizer1 is
TiktokenTokenizer
);
520
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
521
TiktokenTokenizer
tiktoken1 = (tokenizer1 as
TiktokenTokenizer
)!;
532
Assert.Throws<ArgumentNullException>(() =>
TiktokenTokenizer
.CreateForEncoding(null!));
533
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("r50k_base_"));
534
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_base_"));
535
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_edit_"));
536
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("cl100k_base_"));
537
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_base_"));
538
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_harmony_"));
565
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(name);
566
Assert.True(tokenizer is
TiktokenTokenizer
);
570
int entriesCount = GetEncoder((tokenizer as
TiktokenTokenizer
)!)!.Count;
847
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(
TiktokenTokenizer
tiktoken)
848
=> typeof(
TiktokenTokenizer
).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
850
private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(
TiktokenTokenizer
tiktoken)
851
=> typeof(
TiktokenTokenizer
).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
853
private static IReadOnlyDictionary<string, int>? GetVocabulary(
TiktokenTokenizer
tiktoken)
854
=> typeof(
TiktokenTokenizer
).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;