5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1220
return new
TiktokenTokenizer
(
1245
=> new
TiktokenTokenizer
(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1262
=> new
TiktokenTokenizer
(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1290
return new
TiktokenTokenizer
(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1351
return new
TiktokenTokenizer
(vocabStream,
79 references to TiktokenTokenizer
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43
/// Create <see cref="
TiktokenTokenizer
"/> from tokenizer model file.
47
public static
TiktokenTokenizer
FromPretrained(
53
return
TiktokenTokenizer
.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
67
var
tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (2)
Llama\LLaMA3_1.cs (2)
37
var
tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40
var pipeline = new CausalLMPipeline<
TiktokenTokenizer
, LlamaForCausalLM>(tokenizer, model, device);
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1895
new TiktokenPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens),
Model\Phi2Tokenizer.cs (1)
116
vocabStream, mergesStream, new TiktokenPreTokenizer(
TiktokenTokenizer
.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1192
private static
TiktokenTokenizer
CreateForModel(
1239
public static
TiktokenTokenizer
Create(
1256
public static
TiktokenTokenizer
Create(
1274
public static async Task<
TiktokenTokenizer
> CreateAsync(
1303
public static async Task<
TiktokenTokenizer
> CreateAsync(
1329
public static
TiktokenTokenizer
CreateForModel(
1368
public static async Task<
TiktokenTokenizer
> CreateForModelAsync(
1405
public static
TiktokenTokenizer
CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1415
public static
TiktokenTokenizer
CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1445
throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {
TiktokenTokenizer
.Cl100kBaseEncodingName}, {
TiktokenTokenizer
.P50kBaseEncodingName}, {
TiktokenTokenizer
.P50kEditEncodingName}, and {
TiktokenTokenizer
.R50kBaseEncodingName}.", nameof(encodingName));
PreTokenizer\RobertaPreTokenizer.cs (2)
32
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
47
return SplitText(text,
TiktokenTokenizer
.P50kBaseRegex());
Microsoft.ML.Tokenizers.Tests (56)
TitokenTests.cs (56)
31
public static Tokenizer GPT4 { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4", _specialTokens);
32
public static Tokenizer GPT2 { get; } =
TiktokenTokenizer
.CreateForModel("gpt2");
33
public static Tokenizer P50kBase { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-003");
34
public static Tokenizer R50kBase { get; } =
TiktokenTokenizer
.CreateForModel("ada");
35
public static Tokenizer P50kEdit { get; } =
TiktokenTokenizer
.CreateForModel("text-davinci-edit-001");
36
public static Tokenizer GPT4o { get; } =
TiktokenTokenizer
.CreateForModel("gpt-4o");
43
Assert.True(GPT4 is
TiktokenTokenizer
);
44
IReadOnlyDictionary<string, int>? specialTokensEncoder = (GPT4 as
TiktokenTokenizer
)!.SpecialTokens;
58
Tokenizer tokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokensEncoder);
63
tokenizer =
TiktokenTokenizer
.Create(stream, GPT4.PreTokenizer, null, specialTokensEncoder);
67
tokenizer = await
TiktokenTokenizer
.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokensEncoder);
72
tokenizer = await
TiktokenTokenizer
.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokensEncoder);
78
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4", stream);
84
tokenizer = await
TiktokenTokenizer
.CreateForModelAsync("gpt-3.5-turbo", stream);
88
tokenizer =
TiktokenTokenizer
.CreateForModel("gpt-4");
115
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
116
TiktokenTokenizer
externalTokenizer =
TiktokenTokenizer
.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
140
TestDecodingWithSpan((tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
157
private void TestDecodingWithSpan(
TiktokenTokenizer
tokenizer, int[] ids, string expectedDecoded)
193
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
224
TestDecodingWithSpan((gpt4Tokenizer as
TiktokenTokenizer
)!, encoded.ToArray(), text);
236
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
271
TestDecodingWithSpan((GPT4 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
297
TestDecodingWithSpan((GPT4o as
TiktokenTokenizer
)!, encoded.ToArray(), text);
305
TestDecodingWithSpan((GPT4o as
TiktokenTokenizer
)!, encoded.ToArray(), text);
333
TestDecodingWithSpan((GPT2 as
TiktokenTokenizer
)!, encoded.ToArray(), text);
352
TestDecodingWithSpan((P50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
371
TestDecodingWithSpan((P50kEdit as
TiktokenTokenizer
)!, encoded.ToArray(), text);
390
TestDecodingWithSpan((R50kBase as
TiktokenTokenizer
)!, encoded.ToArray(), text);
441
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(modelName);
442
Assert.True(tokenizer is
TiktokenTokenizer
);
454
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForEncoding(encodingName);
455
Assert.True(tokenizer is
TiktokenTokenizer
);
468
Tokenizer tokenizer1 =
TiktokenTokenizer
.CreateForModel(modelName);
470
Assert.True(tokenizer is
TiktokenTokenizer
);
471
Assert.True(tokenizer1 is
TiktokenTokenizer
);
473
TiktokenTokenizer
tiktoken = (tokenizer as
TiktokenTokenizer
)!;
474
TiktokenTokenizer
tiktoken1 = (tokenizer1 as
TiktokenTokenizer
)!;
485
Assert.Throws<ArgumentNullException>(() =>
TiktokenTokenizer
.CreateForEncoding(null!));
486
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("r50k_base_"));
487
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_base_"));
488
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("p50k_edit_"));
489
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("cl100k_base_"));
490
Assert.Throws<ArgumentException>(() =>
TiktokenTokenizer
.CreateForEncoding("o200k_base_"));
507
Tokenizer tokenizer =
TiktokenTokenizer
.CreateForModel(name);
508
Assert.True(tokenizer is
TiktokenTokenizer
);
729
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(
TiktokenTokenizer
tiktoken)
730
=> typeof(
TiktokenTokenizer
).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
732
private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(
TiktokenTokenizer
tiktoken)
733
=> typeof(
TiktokenTokenizer
).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
735
private static IReadOnlyDictionary<string, int>? GetVocabulary(
TiktokenTokenizer
tiktoken)
736
=> typeof(
TiktokenTokenizer
).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;