5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1220: return new TiktokenTokenizer(
1245: => new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1262: => new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1290: return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1351: return new TiktokenTokenizer(vocabStream,
79 references to TiktokenTokenizer
Microsoft.ML.GenAI.LLaMA (3)
LlamaTokenizerHelper.cs (3)
43: /// Create <see cref="TiktokenTokenizer"/> from tokenizer model file.
47: public static TiktokenTokenizer FromPretrained(
53: return TiktokenTokenizer.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
Microsoft.ML.GenAI.LLaMA.Tests (1)
LLaMA3_1Tests.cs (1)
67: var tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);
Microsoft.ML.GenAI.Samples (2)
Llama\LLaMA3_1.cs (2)
37: var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
40: var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
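For context, a minimal sketch of how these Llama references fit together: the helper wraps TiktokenTokenizer.Create and returns a tokenizer that the sample then passes to a pipeline. The namespace and the weight-folder path below are assumptions for illustration, and the pipeline line is kept as a comment because the model and device construction are outside this listing.

// Minimal sketch, assuming the namespace and folder path shown (both illustrative).
using Microsoft.ML.GenAI.LLaMA;   // assumed namespace for LlamaTokenizerHelper
using Microsoft.ML.Tokenizers;

string modelWeightFolder = @"C:\models\Meta-Llama-3.1-8B-Instruct";  // hypothetical path
TiktokenTokenizer tokenizer = LlamaTokenizerHelper.FromPretrained(modelWeightFolder);

// As in Llama\LLaMA3_1.cs line 40, the tokenizer is then handed to the generation pipeline:
// var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);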
Microsoft.ML.Tokenizers (17)
Model\CodeGenTokenizer.cs (1)
1895: new TiktokenPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens),
Model\Phi2Tokenizer.cs (1)
116: vocabStream, mergesStream, new TiktokenPreTokenizer(TiktokenTokenizer.P50kBaseRegex(), CodeGenTokenizer.CodeGenAddedTokens), normalizer: null,
Model\TiktokenTokenizer.cs (13)
1192: private static TiktokenTokenizer CreateForModel(
1239: public static TiktokenTokenizer Create(
1256: public static TiktokenTokenizer Create(
1274: public static async Task<TiktokenTokenizer> CreateAsync(
1303: public static async Task<TiktokenTokenizer> CreateAsync(
1329: public static TiktokenTokenizer CreateForModel(
1368: public static async Task<TiktokenTokenizer> CreateForModelAsync(
1405: public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1415: public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1445: throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
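A short sketch of the public factory entry points listed above, assuming the EncodeToIds/Decode members inherited from the Tokenizer base class; the model and encoding names come straight from the listing, and no output values are asserted here.

// Minimal sketch of the CreateForModel / CreateForEncoding entry points (lines 1405 and 1415).
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

TiktokenTokenizer byModel = TiktokenTokenizer.CreateForModel("gpt-4");
TiktokenTokenizer byEncoding = TiktokenTokenizer.CreateForEncoding("cl100k_base");

// Round-trip some text; EncodeToIds and Decode are assumed from the Tokenizer base class.
IReadOnlyList<int> ids = byModel.EncodeToIds("Hello, World!");
string? roundTripped = byModel.Decode(ids);

// Unsupported encoding names throw ArgumentException, as at line 1445.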
PreTokenizer\RobertaPreTokenizer.cs (2)
32: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
47: return SplitText(text, TiktokenTokenizer.P50kBaseRegex());
Microsoft.ML.Tokenizers.Tests (56)
TitokenTests.cs (56)
31: public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
32: public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
33: public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
34: public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
35: public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
36: public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
43: Assert.True(GPT4 is TiktokenTokenizer);
44: IReadOnlyDictionary<string, int>? specialTokensEncoder = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
58: Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokensEncoder);
63: tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokensEncoder);
67: tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokensEncoder);
72: tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokensEncoder);
78: tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
84: tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
88: tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
115: TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
116: TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
140: TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
157: private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
193: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
224: TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
236: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
271: TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
297: TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
305: TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
333: TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
352: TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
371: TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
390: TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
441: Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
442: Assert.True(tokenizer is TiktokenTokenizer);
454: Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
455: Assert.True(tokenizer is TiktokenTokenizer);
468: Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
470: Assert.True(tokenizer is TiktokenTokenizer);
471: Assert.True(tokenizer1 is TiktokenTokenizer);
473: TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
474: TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
485: Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
486: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
487: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
488: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
489: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
490: Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
507: Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
508: Assert.True(tokenizer is TiktokenTokenizer);
729: private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
730: => typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
732: private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
733: => typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
735: private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
736: => typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;
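A compact sketch of the creation test pattern above, assuming the tests use xunit (as the Assert calls suggest); the class and method names below are hypothetical, while the model name, the trailing-underscore encoding names, and the expected exception types are taken directly from the listing.

// Sketch of the CreateForModel / CreateForEncoding test pattern (xunit assumed; names illustrative).
using System;
using Microsoft.ML.Tokenizers;
using Xunit;

public class TiktokenCreationSketch
{
    [Fact]
    public void CreateForModel_ReturnsTiktokenTokenizer()
    {
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
        Assert.True(tokenizer is TiktokenTokenizer);
    }

    [Fact]
    public void CreateForEncoding_RejectsUnknownNames()
    {
        Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
        Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
    }
}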