5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1318
return new
TiktokenTokenizer
(
1346
=> new
TiktokenTokenizer
(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1366
=> new
TiktokenTokenizer
(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1397
return new
TiktokenTokenizer
(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1461
return new
TiktokenTokenizer
(vocabStream,
107 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
ReducingChatClientTests.cs (1)
21
private static readonly Tokenizer _gpt4oTokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Microsoft.Extensions.DataIngestion.Tests (13)
Chunkers\ChunkerOptionsTests.cs (1)
12
private static readonly Tokenizer _tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\HeaderChunkerTests.cs (8)
39
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
71
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 13 });
96
HeaderChunker lessThanContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 5 });
99
HeaderChunker sameAsContext = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 6 });
119
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 30 });
136
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 37 });
146
HeaderChunker chunker = new(new(
TiktokenTokenizer
.CreateForModel("gpt-4")) { MaxTokensPerChunk = 100 });
175
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
Chunkers\SectionChunkerTests.cs (2)
16
var
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
Chunkers\SemanticSimilarityChunkerTests.cs (1)
26
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4o");
IngestionPipelineTests.cs (1)
226
private static IngestionChunker<string> CreateChunker() => new HeaderChunker(new(
TiktokenTokenizer
.CreateForModel("gpt-4")));
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43
/// Create <see cref="
TiktokenTokenizer
"/> from tokenizer model file.
47
public static
TiktokenTokenizer
FromPretrained(
53
return
TiktokenTokenizer
.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
68
var
tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (7)
Llama\LlamaSample.cs (2)
36
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
39
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Llama\SFT_Llama_3_2_1B.cs (3)
81
public static ICausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM> LoadModel(string weightFolder, string checkPointName = "model.safetensors.index.json")
91
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
94
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
MEAI\Llama3_1.cs (2)
37
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1879
new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117
vocabStream, mergesStream, new RegexPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1285
private static
TiktokenTokenizer
CreateForModel(
1340
public static
TiktokenTokenizer
Create(
1360
public static
TiktokenTokenizer
Create(
1381
public static async Task<
TiktokenTokenizer
> CreateAsync(
1413
public static async Task<
TiktokenTokenizer
> CreateAsync(
1439
public static
TiktokenTokenizer
CreateForModel(
1478
public static async Task<
TiktokenTokenizer
> CreateForModelAsync(
1515
public static
TiktokenTokenizer
CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1525
public static
TiktokenTokenizer
CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1559
throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {
TiktokenTokenizer
.Cl100kBaseEncodingName}, {
TiktokenTokenizer
.P50kBaseEncodingName}, {
TiktokenTokenizer
.P50kEditEncodingName}, and {
TiktokenTokenizer
.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
47
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
Microsoft.ML.Tokenizers.Data.Tests (3)
TokenizerDataTests.cs (3)
30
var exception = Record.Exception(() =>
TiktokenTokenizer
.CreateForModel(modelName));
53
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
Microsoft.ML.Tokenizers.Tests (62)
TiktokenTests.cs (62)
32
public static Tokenizer GPT4 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4", _specialTokens);
33
public static Tokenizer GPT2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt2");
34
public static Tokenizer P50kBase { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-003");
35
public static Tokenizer R50kBase { get; } =
TiktokenTokenizer
.CreateForModel("ada");
36
public static Tokenizer P50kEdit { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-edit-001");
37
public static Tokenizer GPT4o { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4o");
38
public static Tokenizer GPT5 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-5");
39
public static Tokenizer Phi4 { get; } =
TiktokenTokenizer
.CreateForModel("phi-4");
40
public static
TiktokenTokenizer
GptOss { get; } =
TiktokenTokenizer
.CreateForModel("gpt-oss-20b");
48
Assert.True(GPT4 is
TiktokenTokenizer
);
49
IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as
TiktokenTokenizer
)!.SpecialTokens;
53
string assemblyName = typeof(
TiktokenTokenizer
).Assembly.FullName!;
64
Tokenizer tokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
69
tokenizer =
TiktokenTokenizer
.Create(stream, GPT4.PreTokenizer, null, specialTokens);
73
tokenizer = await
TiktokenTokenizer
.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
78
tokenizer = await
TiktokenTokenizer
.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
84
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4", stream);
90
tokenizer = await
TiktokenTokenizer
.CreateForModelAsync("gpt-3.5-turbo", stream);
94
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
121
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
122
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
146
TestDecodingWithSpan((tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
163
private void TestDecodingWithSpan(
TiktokenTokenizer
tokenizer, int[] ids, string expectedDecoded)
199
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
230
TestDecodingWithSpan((gpt4Tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
242
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
277
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
289
foreach (
TiktokenTokenizer
tokenizer in new[] { GPT4o, GptOss, GPT5 })
342
TestDecodingWithSpan((GPT2 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
361
TestDecodingWithSpan((P50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
380
TestDecodingWithSpan((P50kEdit as
TiktokenTokenizer
)!, encoded.ToArray(), text);
399
TestDecodingWithSpan((R50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
477
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(modelName);
478
Assert.True(tokenizer is
TiktokenTokenizer
);
491
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForEncoding(encodingName);
492
Assert.True(tokenizer is
TiktokenTokenizer
);
506
Tokenizer tokenizer1 =
TiktokenTokenizer
.CreateForModel(modelName);
508
Assert.True(tokenizer is
TiktokenTokenizer
);
509
Assert.True(tokenizer1 is
TiktokenTokenizer
);
511
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
512
TiktokenTokenizer
tiktoken1 = (tokenizer1 as
TiktokenTokenizer
)!;
523
Assert.Throws<ArgumentNullException>(() =>
TiktokenTokenizer
.CreateForEncoding(null!));
524
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("r50k_base_"));
525
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_base_"));
526
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_edit_"));
527
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("cl100k_base_"));
528
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_base_"));
529
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_harmony_"));
553
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(name);
554
Assert.True(tokenizer is
TiktokenTokenizer
);
558
int entriesCount = GetEncoder((tokenizer as
TiktokenTokenizer
)!)!.Count;
835
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(
TiktokenTokenizer
tiktoken)
836
=> typeof(
TiktokenTokenizer
).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
838
private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(
TiktokenTokenizer
tiktoken)
839
=> typeof(
TiktokenTokenizer
).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
841
private static IReadOnlyDictionary<string, int>? GetVocabulary(
TiktokenTokenizer
tiktoken)
842
=> typeof(
TiktokenTokenizer
).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;