5 instantiations of TiktokenTokenizer
Microsoft.ML.Tokenizers (5)
Model\TiktokenTokenizer.cs (5)
1236return new TiktokenTokenizer(
1264=> new TiktokenTokenizer(vocabFilePath, preTokenizer, specialTokens, normalizer, cacheSize);
1284=> new TiktokenTokenizer(vocabStream, preTokenizer, specialTokens, normalizer, cacheSize);
1315return new TiktokenTokenizer(encoder, decoder, vocab, preTokenizer, specialTokens, normalizer, cacheSize);
1379return new TiktokenTokenizer(vocabStream,
89 references to TiktokenTokenizer
Microsoft.Extensions.AI.Integration.Tests (1)
Microsoft.ML.GenAI.LLaMA (3)
Microsoft.ML.GenAI.LLaMA.Tests (1)
Microsoft.ML.GenAI.Samples (7)
Microsoft.ML.Tokenizers (17)
Model\TiktokenTokenizer.cs (13)
1203private static TiktokenTokenizer CreateForModel(
1258public static TiktokenTokenizer Create(
1278public static TiktokenTokenizer Create(
1299public static async Task<TiktokenTokenizer> CreateAsync(
1331public static async Task<TiktokenTokenizer> CreateAsync(
1357public static TiktokenTokenizer CreateForModel(
1396public static async Task<TiktokenTokenizer> CreateForModelAsync(
1433public static TiktokenTokenizer CreateForModel(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1443public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null, Normalizer? normalizer = null)
1473throw new ArgumentException($"The encoding name '{encodingName}' is not supported. The only supported encoding names are: {TiktokenTokenizer.Cl100kBaseEncodingName}, {TiktokenTokenizer.P50kBaseEncodingName}, {TiktokenTokenizer.P50kEditEncodingName}, and {TiktokenTokenizer.R50kBaseEncodingName}.", nameof(encodingName));
Microsoft.ML.Tokenizers.Data.Tests (3)
Microsoft.ML.Tokenizers.Tests (57)
TiktokenTests.cs (57)
31public static Tokenizer GPT4 { get; } = TiktokenTokenizer.CreateForModel("gpt-4", _specialTokens);
32public static Tokenizer GPT2 { get; } = TiktokenTokenizer.CreateForModel("gpt2");
33public static Tokenizer P50kBase { get; } = TiktokenTokenizer.CreateForModel("text-davinci-003");
34public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
35public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
36public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
43Assert.True(GPT4 is TiktokenTokenizer);
44IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
48string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
59Tokenizer tokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, GPT4.PreTokenizer, null, specialTokens);
64tokenizer = TiktokenTokenizer.Create(stream, GPT4.PreTokenizer, null, specialTokens);
68tokenizer = await TiktokenTokenizer.CreateAsync(tokenizerDataFileName, GPT4.PreTokenizer, normalizer: null, specialTokens);
73tokenizer = await TiktokenTokenizer.CreateAsync(stream, GPT4.PreTokenizer, normalizer: null, specialTokens);
79tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", stream);
85tokenizer = await TiktokenTokenizer.CreateForModelAsync("gpt-3.5-turbo", stream);
89tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
116TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
117TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, tokenizer.PreTokenizer, null, tiktoken.SpecialTokens);
141TestDecodingWithSpan((tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
158private void TestDecodingWithSpan(TiktokenTokenizer tokenizer, int[] ids, string expectedDecoded)
194TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
225TestDecodingWithSpan((gpt4Tokenizer as TiktokenTokenizer)!, encoded.ToArray(), text);
237TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
272TestDecodingWithSpan((GPT4 as TiktokenTokenizer)!, encoded.ToArray(), text);
298TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
306TestDecodingWithSpan((GPT4o as TiktokenTokenizer)!, encoded.ToArray(), text);
334TestDecodingWithSpan((GPT2 as TiktokenTokenizer)!, encoded.ToArray(), text);
353TestDecodingWithSpan((P50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
372TestDecodingWithSpan((P50kEdit as TiktokenTokenizer)!, encoded.ToArray(), text);
391TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
444Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
445Assert.True(tokenizer is TiktokenTokenizer);
457Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding(encodingName);
458Assert.True(tokenizer is TiktokenTokenizer);
471Tokenizer tokenizer1 = TiktokenTokenizer.CreateForModel(modelName);
473Assert.True(tokenizer is TiktokenTokenizer);
474Assert.True(tokenizer1 is TiktokenTokenizer);
476TiktokenTokenizer tiktoken = (tokenizer as TiktokenTokenizer)!;
477TiktokenTokenizer tiktoken1 = (tokenizer1 as TiktokenTokenizer)!;
488Assert.Throws<ArgumentNullException>(() => TiktokenTokenizer.CreateForEncoding(null!));
489Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("r50k_base_"));
490Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_base_"));
491Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("p50k_edit_"));
492Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("cl100k_base_"));
493Assert.Throws<ArgumentException>(() => TiktokenTokenizer.CreateForEncoding("o200k_base_"));
511Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(name);
512Assert.True(tokenizer is TiktokenTokenizer);
733private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
734=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
736private static IReadOnlyDictionary<int, ReadOnlyMemory<byte>>? GetDecoder(TiktokenTokenizer tiktoken)
737=> typeof(TiktokenTokenizer).GetProperty("Decoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<int, ReadOnlyMemory<byte>>;
739private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
740=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;