5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1236: return new TiktokenTokenizer(
1264: => new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1284: => new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1315: return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1379: return new TiktokenTokenizer(vocabStream,
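
All five instantiations above live inside Model\TiktokenTokenizer.cs itself, so external code typically obtains an instance through the static factory methods instead. A minimal sketch of that public path, under that assumption; the model name and vocab file path are placeholders:

using System.IO;
using Microsoft.ML.Tokenizers;

// Resolve the vocabulary, pre-tokenizer, and special tokens from a known model name.
TiktokenTokenizer fromModelName = TiktokenTokenizer.CreateForModel("gpt-4");

// Or supply the tiktoken vocab data yourself; "cl100k_base.tiktoken" is a placeholder path.
using Stream vocabStream = File.OpenRead("cl100k_base.tiktoken");
TiktokenTokenizer fromStream = TiktokenTokenizer.CreateForModel("gpt-4", vocabStream);
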
89 references to TiktokenTokenizer
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43: /// Create <see cref="TiktokenTokenizer"/> from tokenizer model file.
47: public static TiktokenTokenizer FromPretrained(
53: return TiktokenTokenizer.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
68: var tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (7)
Llama\LlamaSample.cs (2)
36: var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
39: var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
Llama\SFT_Llama_3_2_1B.cs (3)
81: public static ICausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM> LoadModel(string weightFolder, string checkPointName = "model.safetensors.index.json")
91: var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
94: var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
MEAI\Llama3_1.cs (2)
37: var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40: var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
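
The three samples above share one pattern: load the tokenizer with LlamaTokenizerHelper.FromPretrained, load the model, and wrap both in a CausalLMPipeline. A hedged sketch of that wiring; the weight folder path, the device string, the namespaces, and the LlamaForCausalLM.FromPretrained loader call are assumptions rather than the samples' exact code:

using Microsoft.ML.GenAI.Core;
using Microsoft.ML.GenAI.LLaMA;
using Microsoft.ML.Tokenizers;

string weightFolder = @"C:\models\Llama-3.1-8B-Instruct";                 // placeholder path
TiktokenTokenizer tokenizer = LlamaTokenizerHelper.FromPretrained(weightFolder);
LlamaForCausalLM model = LlamaForCausalLM.FromPretrained(weightFolder);   // assumed loader overload
var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, "cuda");
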
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1897: new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens),
Model\Phi2Tokenizer.cs (1)
117: vocabStream, mergesStream, new RegexPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenSpecialTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1203: private static TiktokenTokenizer CreateForModel(
1258: public static TiktokenTokenizer Create(
1278: public static TiktokenTokenizer Create(
1299: public static async Task<TiktokenTokenizer> CreateAsync(
1331: public static async Task<TiktokenTokenizer> CreateAsync(
1357: public static TiktokenTokenizer CreateForModel(
1396: public static async Task<TiktokenTokenizer> CreateForModelAsync(
1433: public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1443: public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1473: throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
47: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
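
The factory methods listed above in Model\TiktokenTokenizer.cs (notably the overloads at lines 1433 and 1443) are the usual entry points. A minimal sketch, assuming the required tokenizer data is available; the sample text is arbitrary:

using Microsoft.ML.Tokenizers;

Tokenizer byModel    = TiktokenTokenizer.CreateForModel("gpt-4");          // model name resolves the encoding
Tokenizer byEncoding = TiktokenTokenizer.CreateForEncoding("cl100k_base"); // encoding name used directly

var ids = byModel.EncodeToIds("Hello, Tiktoken!");
string? roundTrip = byModel.Decode(ids);

// An unrecognized encoding name is rejected with an ArgumentException
// (see the message constructed at line 1473 above).
// TiktokenTokenizer.CreateForEncoding("cl100k_base_");
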
Microsoft.ML.Tokenizers.Data.Tests (3)
TokenizerDataTests.cs (3)
30: var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));
53: TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
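
These data tests hinge on whether the tokenizer data needed by CreateForModel is present. A hedged sketch of that pattern, assuming availability is controlled by the referenced Microsoft.ML.Tokenizers.Data.* package (an assumption, not stated in the listing):

using Microsoft.ML.Tokenizers;
using Xunit;

// Record.Exception captures any failure instead of failing the test outright.
var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel("gpt-4"));

// With the matching data package referenced, exception should be null;
// without it, creating the tokenizer is expected to throw.
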
Microsoft.ML.Tokenizers.Tests (58)
TiktokenTests.cs (58)
31: public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
32: public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
33: public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
34: public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
35: public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
36: public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
43: Assert.True(GPT4 is TiktokenTokenizer);
44: IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
48: string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
59: Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
64: tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
68: tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
73: tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
79: tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
85: tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
89: tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
116: TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
117: TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
141: TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
158: private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
194: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
225: TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
237: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
272: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
298: TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
306: TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
334: TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
353: TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
372: TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
391: TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
444: Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
445: Assert.True(tokenizer is TiktokenTokenizer);
457: Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
458: Assert.True(tokenizer is TiktokenTokenizer);
471: Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
473: Assert.True(tokenizer is TiktokenTokenizer);
474: Assert.True(tokenizer1 is TiktokenTokenizer);
476: TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
477: TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
488: Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
489: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
490: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
491: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
492: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
493: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
511: Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
512: Assert.True(tokenizer is TiktokenTokenizer);
516: int entriesCount = GetEncoder((tokenizer as TiktokenTokenizer)!)!.Count;
733: private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
734: => typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
736: private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
737: => typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
739: private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
740: => typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;